Loading library and dataset
!pip install xgboost
!pip install feature_engine
!pip install delayed
import sklearn.cluster
# IMPORT ALL NEEDED LIBRARIES
from sklearn.metrics.cluster import rand_score
import numpy as np
import pandas as pd
import os
import time
import statistics
import seaborn as sns
from numpy import unique
from numpy import where
# data visualization
import matplotlib.pyplot as plt
# methods for data preparation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# evaluation metrics SUPERVISED
from sklearn.metrics import (
accuracy_score,
precision_score,
recall_score,
confusion_matrix,
classification_report,
f1_score
)
# evaluation metrics UNSUPERVISED
from sklearn.metrics import (
homogeneity_score,
completeness_score,
v_measure_score,
silhouette_score,
calinski_harabasz_score,
rand_score,
adjusted_rand_score,
fowlkes_mallows_score
)
# CV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
# distribution probabilities
from scipy.stats import uniform, randint
from feature_engine.encoding import CountFrequencyEncoder
# supervised
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
# unsupervised
from sklearn.datasets import make_classification
from sklearn.mixture import GaussianMixture as GMM
from sklearn.cluster import (
KMeans,
AgglomerativeClustering,
DBSCAN,
Birch
)
# to plot dendogram for Agglomerative Clustering
from scipy.cluster.hierarchy import dendrogram
# in order to avoid the visualisation of some warning messages
import warnings
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning) # Command not to print warning messages while plotting boxplots
warnings.simplefilter(action="ignore", category=FutureWarning) # Command not to print warning messages while plotting
Requirement already satisfied: xgboost in c:\users\ester\anaconda3\lib\site-packages (1.5.1) Requirement already satisfied: scipy in c:\users\ester\anaconda3\lib\site-packages (from xgboost) (1.6.2) Requirement already satisfied: numpy in c:\users\ester\anaconda3\lib\site-packages (from xgboost) (1.20.1) Requirement already satisfied: feature_engine in c:\users\ester\anaconda3\lib\site-packages (1.1.2) Requirement already satisfied: scikit-learn>=0.22.2 in c:\users\ester\anaconda3\lib\site-packages (from feature_engine) (0.24.1) Requirement already satisfied: numpy>=1.18.2 in c:\users\ester\anaconda3\lib\site-packages (from feature_engine) (1.20.1) Requirement already satisfied: scipy>=1.4.1 in c:\users\ester\anaconda3\lib\site-packages (from feature_engine) (1.6.2) Requirement already satisfied: pandas>=1.0.3 in c:\users\ester\anaconda3\lib\site-packages (from feature_engine) (1.2.4) Requirement already satisfied: statsmodels>=0.11.1 in c:\users\ester\anaconda3\lib\site-packages (from feature_engine) (0.12.2) Requirement already satisfied: pytz>=2017.3 in c:\users\ester\anaconda3\lib\site-packages (from pandas>=1.0.3->feature_engine) (2021.1) Requirement already satisfied: python-dateutil>=2.7.3 in c:\users\ester\anaconda3\lib\site-packages (from pandas>=1.0.3->feature_engine) (2.8.1) Requirement already satisfied: six>=1.5 in c:\users\ester\anaconda3\lib\site-packages (from python-dateutil>=2.7.3->pandas>=1.0.3->feature_engine) (1.15.0) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\ester\anaconda3\lib\site-packages (from scikit-learn>=0.22.2->feature_engine) (2.1.0) Requirement already satisfied: joblib>=0.11 in c:\users\ester\anaconda3\lib\site-packages (from scikit-learn>=0.22.2->feature_engine) (1.0.1) Requirement already satisfied: patsy>=0.5 in c:\users\ester\anaconda3\lib\site-packages (from statsmodels>=0.11.1->feature_engine) (0.5.1) Requirement already satisfied: delayed in c:\users\ester\anaconda3\lib\site-packages (0.11.0b1) Requirement already satisfied: redis in c:\users\ester\anaconda3\lib\site-packages (from delayed) (4.0.2) Requirement already satisfied: hiredis in c:\users\ester\anaconda3\lib\site-packages (from delayed) (2.0.0) Requirement already satisfied: deprecated in c:\users\ester\anaconda3\lib\site-packages (from redis->delayed) (1.2.13) Requirement already satisfied: wrapt<2,>=1.10 in c:\users\ester\anaconda3\lib\site-packages (from deprecated->redis->delayed) (1.12.1)
# Importing dataset into Pandas dataframe
df = pd.read_csv ('../../Desktop/db/malware_train_10pc.csv', sep= ',')
# Creation of a dataframe where we keep track of all the result obtained with the different algorithm+
result = pd.DataFrame(index=["Accuracy", "Precision", "Recall","F1"]) # Empty df to fill with metrics of different models trained along the notebook
# retrive original values from encoded and standardized ones
def retr(column):
l = []
l_index = []
for i in range(len(df)):
p = []
if df_comparison[column].iloc[i] not in l_index:
p.append(df_comparison[column].iloc[i])
l_index.append(df_comparison[column].iloc[i])
p.append(df[column].iloc[i])
l.append(p)
else:
continue
return pd.DataFrame(l, columns = ['real', column])
# encoding function
def retrive_values_enc(columns, method='frequency'):
for i in columns:
df[i]= df[i].astype(str)
# set up the encoder
encoder = CountFrequencyEncoder(encoding_method='frequency',
variables=columns)
# fit the encoder
encoder.fit(df[columns])
df_enc = encoder.transform(df[columns])
for i in columns:
df[i] = df_enc[i]
return encoder.encoder_dict_
# FUNCTION FOR STATISTICS
def max_min_mean_med (df, column):
print("The max of %s is: "%column ,max(df[column])) # MAX
print("\nThe min of %s is: "%column,min(df[column])) # MIN
print("\nThe mean of %s is: "%column,np.mean(df[column])) # MEAN
print("\nThe median of %s is: "%column,statistics.median(df[column])) # MEDIAN
# Function for emphasize the cell with the higher value
def highlight_max(s, props=''):
return np.where(s == np.nanmax(s.values), props, '')
These series of functions were implented to manage the outliers.
# Definition of functions to count outliers + generate boxplot + delete outliers
# Generate boxpot
def get_iqr_values(df_in, col_name):
median = df_in[col_name].median()
q1 = df_in[col_name].quantile(0.25) # 25th percentile / 1st quartile
q3 = df_in[col_name].quantile(0.75) # 7th percentile / 3rd quartile
iqr = q3-q1 # Interquartile range
minimum = q1-1.5*iqr # The minimum value or the |- marker in the box plot
maximum = q3+1.5*iqr # The maximum value or the -| marker in the box plot
return median, q1, q3, iqr, minimum, maximum
# Count outliers
def count_outliers(df_in, col_name, iqr_):
_, _, _, _, minimum, maximum = iqr_
df_outliers = df_in.loc[(df_in[col_name] <= minimum) | (df_in[col_name] >= maximum)]
return df_outliers.shape[0]
# Delete outliers
def remove_outliers(df_in, col_name, iqr_):
_, _, _, _, minimum, maximum = iqr_
df_out = df_in.loc[(df_in[col_name] > minimum) & (df_in[col_name] < maximum)]
return df_out
# Frequency encoding
# Here the function freq_enc permits to transform the cathegorical columns indicated with frequency encoding in the dataframe
def freq_enc (df, list_cat):
for i in list_cat:
count_map = df[i].value_counts().to_dict()
df[i] = df[i].map(count_map)
return df
# Performs k-fold cross validation
def cross_val (model, X_train, y_train):
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=3)
s_r = ('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))
return s_r
# Function for dropping p_value higher than 0.05-> meaning that the variable is no relevant for the analysis
def p_value_drop(df, p_values):
# given the starting df we drop features where p_value > 0.05
list_to_drop = []
for i in range(len(p_values)):
if p_values[i]>0.05:
list_to_drop.append(X.columns[i])
df_new = df.drop(columns=list_to_drop)
print(list_to_drop)
return df_new
• How many rows and how many columns are there in the data?
print("Number of rows: ",df.shape[0]) # PRINT THE NUMBER OF ROWS
print("Number of columns: ",df.shape[1]) # PRINT THE NUMBER OF COLUMNS
Number of rows: 893687 Number of columns: 83
• What are the names and datatypes in each column?
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 893687 entries, 0 to 893686 Data columns (total 83 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MachineIdentifier 893687 non-null object 1 ProductName 893687 non-null object 2 EngineVersion 893687 non-null object 3 AppVersion 893687 non-null object 4 AvSigVersion 893687 non-null object 5 IsBeta 893687 non-null int64 6 RtpStateBitfield 890511 non-null float64 7 IsSxsPassiveMode 893687 non-null int64 8 DefaultBrowsersIdentifier 43456 non-null float64 9 AVProductStatesIdentifier 889999 non-null float64 10 AVProductsInstalled 889999 non-null float64 11 AVProductsEnabled 889999 non-null float64 12 HasTpm 893687 non-null int64 13 CountryIdentifier 893687 non-null int64 14 CityIdentifier 860844 non-null float64 15 OrganizationIdentifier 617784 non-null float64 16 GeoNameIdentifier 893664 non-null float64 17 LocaleEnglishNameIdentifier 893687 non-null int64 18 Platform 893687 non-null object 19 Processor 893687 non-null object 20 OsVer 893687 non-null object 21 OsBuild 893687 non-null int64 22 OsSuite 893687 non-null int64 23 OsPlatformSubRelease 893687 non-null object 24 OsBuildLab 893686 non-null object 25 SkuEdition 893687 non-null object 26 IsProtected 890020 non-null float64 27 AutoSampleOptIn 893687 non-null int64 28 PuaMode 237 non-null object 29 SMode 840037 non-null float64 30 IeVerIdentifier 887778 non-null float64 31 SmartScreen 576167 non-null object 32 Firewall 884617 non-null float64 33 UacLuaenable 892573 non-null float64 34 Census_MDC2FormFactor 893687 non-null object 35 Census_DeviceFamily 893687 non-null object 36 Census_OEMNameIdentifier 884120 non-null float64 37 Census_OEMModelIdentifier 883518 non-null float64 38 Census_ProcessorCoreCount 889615 non-null float64 39 Census_ProcessorManufacturerIdentifier 889614 non-null float64 40 Census_ProcessorModelIdentifier 889611 non-null float64 41 Census_ProcessorClass 3663 non-null object 42 Census_PrimaryDiskTotalCapacity 888417 non-null float64 43 Census_PrimaryDiskTypeName 892376 non-null object 44 Census_SystemVolumeTotalCapacity 888419 non-null float64 45 Census_HasOpticalDiskDrive 893687 non-null int64 46 Census_TotalPhysicalRAM 885757 non-null float64 47 Census_ChassisTypeName 893618 non-null object 48 Census_InternalPrimaryDiagonalDisplaySizeInInches 889013 non-null float64 49 Census_InternalPrimaryDisplayResolutionHorizontal 889025 non-null float64 50 Census_InternalPrimaryDisplayResolutionVertical 889025 non-null float64 51 Census_PowerPlatformRoleName 893683 non-null object 52 Census_InternalBatteryType 258928 non-null object 53 Census_InternalBatteryNumberOfCharges 867183 non-null float64 54 Census_OSVersion 893687 non-null object 55 Census_OSArchitecture 893687 non-null object 56 Census_OSBranch 893687 non-null object 57 Census_OSBuildNumber 893687 non-null int64 58 Census_OSBuildRevision 893687 non-null int64 59 Census_OSEdition 893687 non-null object 60 Census_OSSkuName 893687 non-null object 61 Census_OSInstallTypeName 893687 non-null object 62 Census_OSInstallLanguageIdentifier 887576 non-null float64 63 Census_OSUILocaleIdentifier 893687 non-null int64 64 Census_OSWUAutoUpdateOptionsName 893687 non-null object 65 Census_IsPortableOperatingSystem 893687 non-null int64 66 Census_GenuineStateName 893687 non-null object 67 Census_ActivationChannel 893687 non-null object 68 Census_IsFlightingInternal 151694 non-null float64 69 Census_IsFlightsDisabled 877727 non-null float64 70 Census_FlightRing 893687 non-null object 71 Census_ThresholdOptIn 326729 non-null float64 72 Census_FirmwareManufacturerIdentifier 875389 non-null float64 73 Census_FirmwareVersionIdentifier 877657 non-null float64 74 Census_IsSecureBootEnabled 893687 non-null int64 75 Census_IsWIMBootEnabled 327481 non-null float64 76 Census_IsVirtualDevice 892095 non-null float64 77 Census_IsTouchEnabled 893687 non-null int64 78 Census_IsPenCapable 893687 non-null int64 79 Census_IsAlwaysOnAlwaysConnectedCapable 886633 non-null float64 80 Wdft_IsGamer 863247 non-null float64 81 Wdft_RegionIdentifier 863247 non-null float64 82 HasDetections 893687 non-null int64 dtypes: float64(36), int64(17), object(30) memory usage: 565.9+ MB
• What percentage of computers are infected?
print("Percentage of infected computers: " + str(len(df[df['HasDetections']==1]) / df.shape[0] * 100) + "%")
Percentage of infected computers: 50.00721729196016%
• What percentage of computers have touch screens enabled?
print("Percentage of computers with touch screen enabled: " + str(len(df[df['Census_IsTouchEnabled']==1]) / df.shape[0] * 100) + "%")
Percentage of computers with touch screen enabled: 12.539289482783122%
• What percentage of computers have solid-state hard drives?
print("Percentage of computers with solid-state hard drives: " + str(len(df[df['Census_HasOpticalDiskDrive']==0]) / df.shape[0] * 100) + "%")
Percentage of computers with solid-state hard drives: 92.21897599495125%
• What percentage of computers are gaming machines?
print("Percentage of gaming machines: " + str(len(df[df['Wdft_IsGamer']==1]) / df.shape[0] * 100) + "%")
Percentage of gaming machines: 27.38106294485653%
• What percentage of computers have a firewall enabled?
print("Percentage of computers with firewall enabled: " + str(len(df[df['Firewall']==1]) / df.shape[0] * 100) + "%")
Percentage of computers with firewall enabled: 96.84721832140335%
• What is the max/min/mean/median processor count?
max_min_mean_med(df,'Census_ProcessorCoreCount')
The max of Census_ProcessorCoreCount is: 128.0 The min of Census_ProcessorCoreCount is: 1.0 The mean of Census_ProcessorCoreCount is: 3.9915772553295525 The median of Census_ProcessorCoreCount is: 8.0
• What is the max/min/mean/median RAM on the machines?
max_min_mean_med(df,'Census_TotalPhysicalRAM')
The max of Census_TotalPhysicalRAM is: 524288.0 The min of Census_TotalPhysicalRAM is: 512.0 The mean of Census_TotalPhysicalRAM is: 6114.022286022013 The median of Census_TotalPhysicalRAM is: 3072.0
• What is the max/min/mean/median display size in inches?
max_min_mean_med(df, 'Census_InternalPrimaryDiagonalDisplaySizeInInches')
The max of Census_InternalPrimaryDiagonalDisplaySizeInInches is: 142.0 The min of Census_InternalPrimaryDiagonalDisplaySizeInInches is: 3.5 The mean of Census_InternalPrimaryDiagonalDisplaySizeInInches is: 16.673138525550353 The median of Census_InternalPrimaryDiagonalDisplaySizeInInches is: 13.9
• How many countries and cities are there in the dataset?
print("Number of countries: ", len(df["CountryIdentifier"].unique())) # PRINT NUMBER OF COUNTRIES
print("Number of cities: ", len(df["CityIdentifier"].unique())) # PRINT NUMBER OF CITIES
Number of countries: 222 Number of cities: 48403
**COMMENT**:
As far we have seen is that the dataset is balaced according to the label, since the percentage of devices infected is 50%, so we can go on with our analysis without implement any oversampling or downsampling. We also noticed that some variables have some MAX values much higher than the mean of the column, so we are going to meet lots outliers.
For instances:
Visualise the distributions contained in the various columns:
After an analysis of the dataset we noticed that there were only 8 real numerical columns, all the other were categorical features, some of those in particular, were binary.
numerical_columns = [ # many of int and float variables resulted to be
'Census_ProcessorCoreCount', # identifier and so we considered them as categorical
'Census_PrimaryDiskTotalCapacity', # variables
'Census_SystemVolumeTotalCapacity',
'Census_TotalPhysicalRAM',
'Census_InternalPrimaryDiagonalDisplaySizeInInches',
'Census_InternalPrimaryDisplayResolutionHorizontal',
'Census_InternalPrimaryDisplayResolutionVertical',
'Census_InternalBatteryNumberOfCharges'
]
binary_columns = [j for j in df.columns if df[j].nunique() == 2]
categorical_columns = [y for y in df.columns if (y not in numerical_columns) & (y not in binary_columns)]
• Create bar plots and histograms for various features. Are there any obvious outliers?
An outlier for a numerical features is number particular distant or bigger than the others
While there is no concept of outliers detection in categorical variables(nominal), as each value is count as labels. Based on frequency(Mode), we can't do outliers treatment for categorical variables.
So we generate a bar plot for binary features to compare the distribution in the value with HasDetections == 1 and HasDetections == 0 (= NotHasDetections)
for i in numerical_columns:
tmp_df = df.groupby(i).size().reset_index()
plt.hist(tmp_df[i], color='#86bf91', zorder=2, rwidth=0.9, density=False, bins=20) # density=False would make counts
plt.xlabel(i)
plt.show()
**COMMENT**: From these histograms we can see that:
have some outliers in their graphs, but we can see them even better with the boxplot</font>
df_infected = df[df['HasDetections']==1]
df_not_infected = df[df['HasDetections']==0]
for i in numerical_columns:
fig, ax = plt.subplots(figsize=(14,8))
ax.hist([df_not_infected[i], df_infected[i]], zorder=2, rwidth=0.9, density=False, bins=20, label=("NoHasDetections", "HasDetections"))
ax.set_title(i)
ax.legend()
**COMMENT**:
Here we can see that the distributionof the numerical variables is almost the same accordign to the label, only in:
we can see some slightly differences among the value that these diffferent columns have.</font>
• Plot the frequency and mean statistics mentioned above for the infected and not-infected machines. Are there differences in the statistics across the two groups? What does that mean?
binary = binary_columns
binary.remove('HasDetections')
for i in binary[:-1]:
tmp_df1 = df_infected.groupby(i).size().reset_index()
tmp_df2 = df_not_infected.groupby(i).size().reset_index()
fig, ax = plt.subplots(1, 2, figsize=(15, 8))
barlist1 = ax[0].bar(tmp_df1[i], tmp_df1[0]) # Bar plot for NotHasDetections values
ax[0].set_xlabel(i +'\n NotHasDetections ')
barlist1[0].set_color('r') # for the value 0 set the color red
barlist1[1].set_color('g') # for the value 1 set the color green
colors = {0: 'red', 1:'green'} # define the color
labels = list(colors.keys())
handles = [plt.Rectangle((0,0),1,1, color=colors[label]) for label in labels]
ax[0].legend(handles, labels) # pu a legend on the graph
barlist2 = ax[1].bar(tmp_df2[i], tmp_df2[0], label = 'HasDetections')
barlist2[0].set_color('r')
barlist2[1].set_color('g')
ax[1].set_xlabel(i +'\n HasDetections ')
colors = {0: 'red', 1:'green'}
labels = list(colors.keys())
handles = [plt.Rectangle((0,0),1,1, color=colors[label]) for label in labels]
ax[1].legend(handles, labels)
plt.show()
**COMMENT**:
Here we did the same analysis of before, but with the binary variables. Also in this case there are no big difference among the columns, but an interesting thing that we can notice is that some of this variable are composed of almost just one label, the occurence of the labels is no balcend in the different columns (VARIABILITY PROBLEM). This happens in:
For more details see in Dealing with low variability and Null values
percent = pd.DataFrame(index=["PercentageInfected", "PercentageNotInfected"])
def dataframe_percentage (df_inf,df_no_inf, column):
percent[column] = [len(df_inf[df_inf[column]==1])/df_inf.shape[0]*100, len(df_no_inf[df_no_inf[column]==1])/df_no_inf.shape[0]*100]
for i in binary:
dataframe_percentage(df_infected,df_not_infected, i)
percent.T.style.apply(highlight_max, props='color:white; background-color:purple;', axis=1)
# axis=1 so in this way we find the higher percentage for each row
| PercentageInfected | PercentageNotInfected | |
|---|---|---|
| IsBeta | 0.000671 | 0.000448 |
| IsSxsPassiveMode | 1.293779 | 2.151847 |
| HasTpm | 98.868671 | 98.756208 |
| IsProtected | 95.565978 | 92.807630 |
| AutoSampleOptIn | 0.004475 | 0.002238 |
| SMode | 0.013202 | 0.070505 |
| Firewall | 96.888621 | 96.805803 |
| Census_HasOpticalDiskDrive | 8.342657 | 7.219229 |
| Census_IsPortableOperatingSystem | 0.063995 | 0.049018 |
| Census_IsFlightingInternal | 0.000224 | 0.000224 |
| Census_IsFlightsDisabled | 0.000224 | 0.002014 |
| Census_ThresholdOptIn | 0.007832 | 0.008281 |
| Census_IsSecureBootEnabled | 48.419809 | 48.687382 |
| Census_IsVirtualDevice | 0.280371 | 1.148890 |
| Census_IsTouchEnabled | 11.158001 | 13.920977 |
| Census_IsPenCapable | 3.454850 | 4.126649 |
| Census_IsAlwaysOnAlwaysConnectedCapable | 4.205787 | 7.159916 |
| Wdft_IsGamer | 29.690451 | 25.071008 |
**COMMENT**:
In this dataframe where we highlight the higher percentage of each rows, there are no much difference. The more relevant are in:
In particular we can see that beeing a Gamer increase a little bit the chance to be infected, that make sense because different games where place for several malware attacks in the years. For all the other mentioned in the list is the opposite, so having a touch device or pen capable device increase the possibility to not be infected, but the difference is no so relevant. </font>
max_min = pd.DataFrame(index=["MAX", "MIN", "MEAN","MEDIAN"])
def dataframe_max_min (df, column):
max_min[column] = [max(df[column]),min(df[column]),np.mean(df[column]),statistics.median(df[column])]
dataframe_max_min(df_infected, 'Census_ProcessorCoreCount')
dataframe_max_min(df_infected, 'Census_TotalPhysicalRAM')
dataframe_max_min(df_infected, 'Census_InternalPrimaryDiagonalDisplaySizeInInches')
dataframe_max_min(df_infected, 'Census_SystemVolumeTotalCapacity')
max_min.T.add_prefix('HasDetections_')
| HasDetections_MAX | HasDetections_MIN | HasDetections_MEAN | HasDetections_MEDIAN | |
|---|---|---|---|---|
| Census_ProcessorCoreCount | 88.0 | 1.0 | 4.104323 | 4.0 |
| Census_TotalPhysicalRAM | 524288.0 | 512.0 | 6407.689206 | 6144.0 |
| Census_InternalPrimaryDiagonalDisplaySizeInInches | 142.0 | 3.9 | 16.879251 | 18.9 |
| Census_SystemVolumeTotalCapacity | 26707676.0 | 0.0 | 381760.982961 | 456406.0 |
dataframe_max_min(df_not_infected, 'Census_ProcessorCoreCount')
dataframe_max_min(df_not_infected, 'Census_TotalPhysicalRAM')
dataframe_max_min(df_not_infected, 'Census_InternalPrimaryDiagonalDisplaySizeInInches')
dataframe_max_min(df_not_infected, 'Census_SystemVolumeTotalCapacity')
max_min.T.add_prefix('NotHasDetections_')
| NotHasDetections_MAX | NotHasDetections_MIN | NotHasDetections_MEAN | NotHasDetections_MEDIAN | |
|---|---|---|---|---|
| Census_ProcessorCoreCount | 128.0 | 1.0 | 3.878897 | 8.0 |
| Census_TotalPhysicalRAM | 524288.0 | 512.0 | 5819.963220 | 4096.0 |
| Census_InternalPrimaryDiagonalDisplaySizeInInches | 142.0 | 3.5 | 16.466988 | 19.2 |
| Census_SystemVolumeTotalCapacity | 11445719.0 | 9214.0 | 372431.424551 | 935564.0 |
print('Countries and cities: ')
print("\nNumber of countries where machine HasDetections: ", len(df_infected["CountryIdentifier"].unique())) # PRINT NUMBER OF COUNTRIES
print("Number of cities where machine HasDetections: ", len(df_infected["CityIdentifier"].unique())) # PRINT NUMBER OF CITIES
print("\n\nNumber of countries where machine NotHasDetections: ", len(df_not_infected["CountryIdentifier"].unique())) # PRINT NUMBER OF COUNTRIES
print("Number of cities where machine NotHasDetections: ", len(df_not_infected["CityIdentifier"].unique()))
Countries and cities: Number of countries where machine HasDetections: 222 Number of cities where machine HasDetections: 34748 Number of countries where machine NotHasDetections: 222 Number of cities where machine NotHasDetections: 35622
**COMMENT**:
From the dataframe of MaxMinMeanMedian we can see that some variables are distribuited almost in the same way as InternalPrimaryDiagonalDisplaySizeInInches.
Also in TotalPhysicalRAM and ProcessorCoreCount there is a very little difference, that does not highlight anything in particular.
While in SystemVolumeTotalCapacity we have a very big difference in almost all the metrics, showing that who has very big or extremely low volume total capacity is more probable to be infected, all the value in the between are in the frame of NotHasDetections. Neverthenless the distribution in all the variables group by label is always the same, if it is positive/negative in one group is positive/negative also in the other.</font>
• Plot a heatmap of the correlations. Which features are correlated with one another? Do the correlations make sense?
We will look at the heatmap of the numerical columns, since the heatmap make sense only with those type of features.
Usually to look at the correlation among categorical features is used the Chi-squared test instead.
corr_df = df[numerical_columns].corr(method='pearson')
plt.figure(figsize=(10, 10))
heatmap = sns.heatmap(corr_df, annot=True)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12)
plt.show()
**COMMENT**:
As we can see, there is a very high correlation among InternalPrimaryResolutionHorizontal -- InternalPrimaryResolutionVertical. That makes totaly sense since this two features deal with the same information. Also TotalPhysicalRAM -- InternalPrimaryDiagonalDisplaySizeInches are highly correlated, this because usually device with big RAM have also bigger size.</font>
For performing our analysis we decide to follow this kind of flow:
Data Preparation: in this phase (that is cover by the followwing part of code), we tried to fix all the problems found in the previous results, applying filter and other kind of Features Selection (PCA and RandomForest)
Since in the initial dataset we had:
After that we went on implementing the different algorithm and at the end evaluating the result otained.</font>

FLOW: we chose to start the dataset cleaning by removing low variability columns and the one ones with high NaN values in way to drop since the begin the less significant columns avoiding to perform in next steps useless computational tasks.
NULL VALUES : Moreover some columns (both categorical(including binary ones) and numerical) had more than 98% of the values as Null/NaN values, so we decided to eliminate them.
LOW VARIABILITY : Looking at the dataset we noticed that some categorical features were composed mainly by just one category, so we decided to delete those categorical columns where the frequency of just one category was more than 98,5% (cover almost all the values of the column) we decided to delete them.
# CHECKING FOR EACH COLUMNS THE PERCENTAGE OF NAN VALUES
for i in df.columns:
s = df[i].isna().sum()
treshold = s/len(df[i])*100
if treshold > 98:
print(i,":",str(treshold),"%\n")
PuaMode : 99.97348064814638 % Census_ProcessorClass : 99.59012495426252 %
# Drop the features with more than 98% of NaN/Null
drop_col = ["PuaMode",
"Census_ProcessorClass"]
df = df.drop(columns=drop_col)
# CHECKING FOR EACH COLUMNS THE VARIABILITY OF CLASSES
for i in df.columns:
tmp_df = df.groupby(i).size().reset_index()
pctg = max(tmp_df[0])/(sum(tmp_df[0]))*100
if pctg > 98.5:
print(i)
print(pctg,"% \n")
ProductName 98.95007983779556 % IsBeta 99.99944052000309 % HasTpm 98.8124477585553 % AutoSampleOptIn 99.99664312001852 % SMode 99.95547815155761 % UacLuaenable 99.39657596633553 % Census_DeviceFamily 99.83070135293453 % Census_IsPortableOperatingSystem 99.94349252031192 % Census_IsFlightingInternal 99.99868155629096 % Census_IsFlightsDisabled 99.99886069358695 % Census_ThresholdOptIn 99.9779633886187 % Census_IsWIMBootEnabled 100.0 % Census_IsVirtualDevice 99.28415695637797 %
# Drop features with low variability (threshold = 0.98)
drop_col = ["MachineIdentifier",
"ProductName",
"IsBeta",
"HasTpm",
"AutoSampleOptIn",
"SMode",
"UacLuaenable",
"Census_DeviceFamily",
"Census_IsPortableOperatingSystem",
"Census_IsFlightingInternal",
"Census_IsFlightsDisabled",
"Census_ThresholdOptIn",
"Census_IsWIMBootEnabled",
"Census_IsVirtualDevice"]
df = df.drop(columns=drop_col, axis = 1)
# Define lists for binary columns and categorical columns
binary_columns = [j for j in df.columns if df[j].nunique() == 2]
categorical_columns = [y for y in df.columns if (y not in numerical_columns) & (y not in binary_columns)]
In this section we performed a reshape of some values in "SmartScreen" and "Census_InternalBatteryType". They used to have strange format values, but also different format for the same meaning value so we uniformed those in a more understandable way.
# Transform strange format values of some features to more clear data
trans_dict = {
'off': 'Off', '': '2', '': '1', 'on': 'On', 'requireadmin': 'RequireAdmin', 'OFF': 'Off',
'Promt': 'Prompt', 'requireAdmin': 'RequireAdmin', 'prompt': 'Prompt', 'warn': 'Warn',
'00000000': '0', '': '3'
}
trans_dict_1 = {
'˙˙˙': 'unknown', 'unkn': 'unknown'
}
df.replace({'SmartScreen': trans_dict}, inplace=True)
df.replace({'Census_InternalBatteryType': trans_dict_1}, inplace=True)
If we would have deleted all the Null or NaN values more than half of our rows would have gone. So we decided to do the predictions of those missing values.
FLOW: we started by replacing Nan values of categorical features with the mode of each feature. We also performd to add a new binary column called "Replaced" to insert a flag/weight for the rows where there were at least one Nan values from categorical features.
# Replace Nan values of categorical features
df_r1=df
df_r1["Replaced"]=0
for i in categorical_columns:
# 1. add new column and replace if category is null then 1 else 0
df_r1["Replaced"] = np.where((df_r1[i].isnull()) | df_r1["Replaced"]==1,1,0)
# 2 Take most occured category in that vairable (.mode())
Mode_Category = df_r1[i].mode()[0]
# 3 Replace NAN values with most occured category in actual vairable
df_r1[i].fillna(Mode_Category,inplace=True)
for i in binary_columns:
# 1. add new column and replace if category is null then 1 else 0
df_r1["Replaced"] = np.where((df_r1[i].isnull()) | df_r1["Replaced"]==1,1,0)
# 2 Take most occured category in that vairable (.mode())
Mode_Category = df_r1[i].mode()[0]
# 3 Replace NAN values with most occured category in actual vairable
df_r1[i].fillna(Mode_Category,inplace=True)
df = df_r1
ENCODING: In our dataset we have some categorical features with very high variability. Since Label Encoding woulD have had some porblems with those features we decided to apply the Frequency Encoding in those cases. While for features that we were not afraid about high variability we decided to adopt a label encoding. Also for these features we performed a standardization.
STANDARDIZATION: the step after is to standardize che previously encoded columns to avoid problems related to value ranges of different columns
df_comparison = df[categorical_columns]
frequency_encoded_variables = [
'Census_OEMModelIdentifier',
'CityIdentifier',
'Census_FirmwareVersionIdentifier',
'AvSigVersion',
'Census_ProcessorModelIdentifier',
'Census_OEMNameIdentifier',
'DefaultBrowsersIdentifier'
]
dic_freq_enc = retrive_values_enc(frequency_encoded_variables)
categorical_columns = [x for x in categorical_columns if x not in frequency_encoded_variables]
dic_label_enc = retrive_values_enc(categorical_columns, 'ordered')
As we said previoulsy, to avoid bias effects dependent on high range values a standardization has been performed. We are going to use lots of algorithms that are based on distances metrics, so we need to standardized in order to not get biased results.
# Standardize frequency econded features
scaler = StandardScaler()
scaled = scaler.fit_transform(df[frequency_encoded_variables])
df[frequency_encoded_variables] = scaled
# Standardize label econded features
scaled = scaler.fit_transform(df[categorical_columns])
df[categorical_columns] = scaled
df_std = df
# Standardization of numerical columns
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
Retrivial of original values from encoded and standardized ones to be able to perform analysis on data
retr_SmartScreen = retr('SmartScreen')
retr_processor = retr('Processor')
retr_AVPoductInstalled = retr('AVProductsInstalled')
OUTLIERS: in this section we deal with the presence of some outliers in numerical features.
We performed to remove the outlier detected on values obtained till this moment on the process of data cleaning. Our group decided not to iterate on the removal of outlier but focus just on the ones detected at the first anlysis: so after the drop of first outliers we didn't check for possible new ones not to compromise too much the dataset potential of information.
# Defining and saving iqr values for each feature
l_iqr = []
for i in range(len(numerical_columns)):
iqr_ = get_iqr_values(df_std, numerical_columns[i])
l_iqr.append(iqr_)
# Replace Nan with value -2 (value not present in any numerical features)
df_std.fillna(-2, inplace=True)
df_remo = df_std
At the beginning we store into a list the values needed for a boxplot (min, max, median, q1 and q3) of each feature in way to perform the removal of outliers based only on those values detected.
Secondly we managed to replane Nan values with the value -2 (not present in any numerical features) in way to perform removal of outliers without deleting rows with Nan values that otherwise were considered outliers.
It is important to underline that we perform the removal of outliers only on a dataframe that doesn't contain Nan values. Just in the end we concatenate Nan values with the complete dataframe cleaned by outliers.</font>
# Function to remove outliers of numerical columns
outlier_idx2 = []
print(f"Rows BEFORE removing: {df_std.shape[0]}")
for i in range(len(numerical_columns)):
nan_rows = df_remo[df_remo[numerical_columns[i]] == -2]
df_std = df_std[df_std[numerical_columns[i]] != -2]
numOutlier = count_outliers(df_std, numerical_columns[i], l_iqr[i])
Nan_ = nan_rows.shape[0]
print(numerical_columns[i], ": ")
print("numero di outlier prima: ",numOutlier)
print("numero di Nan prima: ",Nan_)
df_remo = remove_outliers(df_std, numerical_columns[i], l_iqr[i])
numOutlier = count_outliers(df_remo, numerical_columns[i], l_iqr[i])
print("numero di outlier dopo: ",numOutlier)
df_remo = pd.concat([df_remo, nan_rows], ignore_index=True)
Nan_ = (df_remo[numerical_columns[i]] == -2).sum()
print("numero di Nan dopo: ",Nan_, "\n")
boxplot = df_std.boxplot(column=[numerical_columns[i]], by="HasDetections", rot=0, grid=True, layout=(2,1), fontsize=15, figsize=(7, 10))
if numOutlier != 0:
outlier_idx2.append(numerical_columns[i])
print(f"Rows AFTER removing: {df_remo.shape[0]}")
print("\n\n The columns with more than 0 outliers: \n", outlier_idx2, "\n")
Rows BEFORE removing: 893687 Census_ProcessorCoreCount : numero di outlier prima: 99066 numero di Nan prima: 4072 numero di outlier dopo: 0 numero di Nan dopo: 4072 Census_PrimaryDiskTotalCapacity : numero di outlier prima: 784 numero di Nan prima: 4907 numero di outlier dopo: 0 numero di Nan dopo: 4907 Census_SystemVolumeTotalCapacity : numero di outlier prima: 7733 numero di Nan prima: 4906 numero di outlier dopo: 0 numero di Nan dopo: 4906 Census_TotalPhysicalRAM : numero di outlier prima: 62069 numero di Nan prima: 7339 numero di outlier dopo: 0 numero di Nan dopo: 7339 Census_InternalPrimaryDiagonalDisplaySizeInInches : numero di outlier prima: 92994 numero di Nan prima: 4320 numero di outlier dopo: 0 numero di Nan dopo: 4320 Census_InternalPrimaryDisplayResolutionHorizontal : numero di outlier prima: 9539 numero di Nan prima: 4309 numero di outlier dopo: 0 numero di Nan dopo: 4309 Census_InternalPrimaryDisplayResolutionVertical : numero di outlier prima: 17219 numero di Nan prima: 4309 numero di outlier dopo: 0 numero di Nan dopo: 4309 Census_InternalBatteryNumberOfCharges : numero di outlier prima: 0 numero di Nan prima: 21022 numero di outlier dopo: 0 numero di Nan dopo: 21022 Rows AFTER removing: 881545 The columns with more than 0 outliers: []
NAN VALUES OF NUMERICAL COLUMNS: The reason why we wanted to keep all Nan values of numerical columns stands on this section. We decided to replace those null values with ones predicted by a linear regression model trained on all other features. We adopted the treshold of R^2 > 0.6 for the linear regression models that we consider good enough to predict our missing numerical values. If the model doesn't perform well enough we just concluded by dropping the remaining rows containing Nan values.
# Defining the funciton to replace Nan values for numerical columns with a Linear Regression model prediction where R^2 > 0.6
df_r1 = df_remo
from sklearn.linear_model import LinearRegression
def predict_numerical (df_r1, num_var):
nan_rows = df_r1[df_r1[numerical_columns[num_var]] == -2] # Split df into 2 df: one with Nan values used as test and one
df_r1 = df_r1[df_r1[numerical_columns[num_var]] != -2] # without Nan used as train
X = df_r1.drop(columns = numerical_columns[num_var])
y = df_r1[numerical_columns[num_var]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1, random_state=42)
x_test= nan_rows.drop([numerical_columns[num_var]], axis=1)
y_test= nan_rows[numerical_columns[num_var]]
regressor = LinearRegression()
reg = regressor.fit(X_train, y_train)
y_pred = regressor.predict(x_test)
df1 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
if reg.score(X_train, y_train) > 0.6:
x_test[numerical_columns[num_var]] = df1['Predicted'] # add predicted values column to df with Nan where R^2 > 0.6
df_r1 = pd.concat([df_r1, x_test], ignore_index=True) # concat df with Nan values with df without Nan
print(reg.score(X_train, y_train))
else:
x_test[numerical_columns[num_var]] = df1['Actual'] # add Nan values column to df with Nan where R^2 < 0.6
df_r1 = pd.concat([df_r1, x_test ], ignore_index=True) # concat df with Nan values with df without Nan
print("df_shape:", df_r1.shape)
print('\n')
return df_r1
# Run the previous function to predict values for numerical columns replacing Nan values
for i in range(len(numerical_columns)):
print("variable_pred: ", numerical_columns[i])
if i == 0:
a = predict_numerical(df_r1, i)
else:
a = predict_numerical(a, i)
variable_pred: Census_ProcessorCoreCount df_shape: (881545, 68) variable_pred: Census_PrimaryDiskTotalCapacity df_shape: (881545, 68) variable_pred: Census_SystemVolumeTotalCapacity df_shape: (881545, 68) variable_pred: Census_TotalPhysicalRAM df_shape: (881545, 68) variable_pred: Census_InternalPrimaryDiagonalDisplaySizeInInches df_shape: (881545, 68) variable_pred: Census_InternalPrimaryDisplayResolutionHorizontal 0.8472491595277064 df_shape: (881545, 68) variable_pred: Census_InternalPrimaryDisplayResolutionVertical 0.8583013843140088 df_shape: (881545, 68) variable_pred: Census_InternalBatteryNumberOfCharges 0.8411773094717034 df_shape: (881545, 68)
# Removing all remaing row with at least one nan value in numerical columns
db_clean = a
for i in db_clean.columns:
db_clean = db_clean[db_clean[i].isin([-2]) == False]
print("rows removed for nan: ", a.shape[0]-db_clean.shape[0])
rows removed for nan: 1337
Some of our variables where very correlated to each other, generation the problem of multicollinearity.
They almost provide the same information, so since some of the algorithms that we will use are high influenced by the presence of correlated features (some other are completely immune by them like Classififcation Tree, Lasso etc), we decide to delete them.
df = db_clean
# Check where correlation among features is > 0.9
corr = df.corr()
high_corr = (corr >= 0.90).astype('uint8')
plt.figure(figsize=(15,15))
sns.heatmap(high_corr, cmap='RdBu_r', annot=True, center=0.0)
plt.show()
# For correlated features we remove the ones with more unique values
droppable = ['OsVer',
'Census_OSArchitecture',
'Census_OSBuildNumber',
'OsBuild',
'Census_OSBranch',
'Census_InternalPrimaryDisplayResolutionHorizontal',
'Census_OSVersion',
'Census_OSUILocaleIdentifier']
df = df.drop(columns=droppable)
# drop Census_InternalPrimaryDisplayResolutionHorizontal from the list of the numerical columns
numerical_columns = [
'Census_ProcessorCoreCount',
'Census_PrimaryDiskTotalCapacity',
'Census_SystemVolumeTotalCapacity',
'Census_TotalPhysicalRAM',
'Census_InternalPrimaryDiagonalDisplaySizeInInches',
'Census_InternalPrimaryDisplayResolutionVertical',
'Census_InternalBatteryNumberOfCharges'
]
binary_columns = [j for j in df.columns if df[j].nunique() == 2]
categorical_columns = [y for y in df.columns if (y not in numerical_columns) & (y not in binary_columns)]
# show of the final dataset cleaned
df
| EngineVersion | AppVersion | AvSigVersion | RtpStateBitfield | IsSxsPassiveMode | DefaultBrowsersIdentifier | AVProductStatesIdentifier | AVProductsInstalled | AVProductsEnabled | CountryIdentifier | CityIdentifier | OrganizationIdentifier | GeoNameIdentifier | LocaleEnglishNameIdentifier | Platform | Processor | OsSuite | OsPlatformSubRelease | OsBuildLab | SkuEdition | IsProtected | IeVerIdentifier | SmartScreen | Firewall | Census_MDC2FormFactor | Census_OEMNameIdentifier | Census_OEMModelIdentifier | Census_ProcessorCoreCount | Census_ProcessorManufacturerIdentifier | Census_ProcessorModelIdentifier | Census_PrimaryDiskTotalCapacity | Census_PrimaryDiskTypeName | Census_SystemVolumeTotalCapacity | Census_HasOpticalDiskDrive | Census_TotalPhysicalRAM | Census_ChassisTypeName | Census_InternalPrimaryDiagonalDisplaySizeInInches | Census_InternalPrimaryDisplayResolutionVertical | Census_PowerPlatformRoleName | Census_InternalBatteryType | Census_InternalBatteryNumberOfCharges | Census_OSBuildRevision | Census_OSEdition | Census_OSSkuName | Census_OSInstallTypeName | Census_OSInstallLanguageIdentifier | Census_OSWUAutoUpdateOptionsName | Census_GenuineStateName | Census_ActivationChannel | Census_FlightRing | Census_FirmwareManufacturerIdentifier | Census_FirmwareVersionIdentifier | Census_IsSecureBootEnabled | Census_IsTouchEnabled | Census_IsPenCapable | Census_IsAlwaysOnAlwaysConnectedCapable | Wdft_IsGamer | Wdft_RegionIdentifier | HasDetections | Replaced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.363289 | -0.947542 | 2.687518 | 0.16531 | 0 | 0.213221 | -1.405365 | -1.403245 | 0.16366 | -1.378746 | -0.330496 | 0.529803 | -0.239250 | -0.298198 | 0.187641 | 0.318115 | 0.771900 | 0.959061 | 1.162692 | 0.737851 | 1.0 | 1.059598 | 0.436024 | 0.0 | 0.729268 | 0.385986 | -0.261549 | 0.004065 | -2.749277 | -0.788344 | -0.001010 | 0.702482 | 1.764481 | 0 | 0.418202 | 0.812828 | 0.106647 | 0.710947 | 0.653900 | 0.257044 | -0.596106 | 0.895222 | 0.718437 | 0.690484 | 1.328502 | 1.302601 | 1.006942 | 0.364058 | 0.715606 | 0.259994 | -0.279175 | 0.270382 | 1 | 0 | 0 | 0.0 | 0.0 | 0.168204 | 1 | 1 |
| 1 | -2.222032 | -1.300421 | -0.722467 | 0.16531 | 0 | 0.213221 | 0.721652 | 0.643383 | 0.16366 | -0.195324 | -0.331622 | 0.529803 | 2.134299 | 1.783846 | 0.187641 | 0.318115 | -1.260527 | -1.333953 | -1.193242 | -1.060307 | 1.0 | -1.249199 | 0.436024 | 1.0 | 0.729268 | 0.385986 | -0.373138 | 0.004065 | 0.363695 | -0.738270 | -0.001085 | 0.702482 | -1.000243 | 0 | -0.406135 | 0.812828 | -0.454777 | -0.603533 | 0.653900 | 0.257044 | -0.596106 | -0.914559 | 0.315254 | 0.346469 | -0.098121 | 1.302601 | 1.006942 | 0.364058 | 0.715606 | 0.259994 | -1.025708 | -0.461681 | 0 | 0 | 0 | 0.0 | 0.0 | -0.014470 | 0 | 1 |
| 2 | -2.351148 | -1.151868 | 2.235865 | 0.16531 | 0 | -4.695658 | -1.321083 | -1.403245 | 0.16366 | 1.202320 | -0.364150 | 0.529803 | -0.457982 | -0.601796 | 0.187641 | 0.318115 | 0.771900 | -0.058117 | -0.398772 | 0.737851 | 1.0 | -0.256914 | 0.436024 | 1.0 | 0.729268 | 0.631297 | 0.330406 | 0.004065 | 0.363695 | -0.739483 | -0.001010 | 0.702482 | 1.713965 | 0 | -0.406135 | 0.812828 | -0.199584 | 0.850786 | 0.653900 | 0.257044 | -0.596106 | 0.151251 | -1.138097 | -1.223920 | -1.155319 | 1.302601 | -0.426188 | 0.364058 | -0.353922 | 0.259994 | -0.279175 | 0.491196 | 1 | 0 | 0 | 0.0 | 0.0 | -1.576833 | 1 | 1 |
| 3 | -2.403825 | -1.196187 | -0.810817 | 0.16531 | 0 | -4.687355 | -1.265009 | -1.403245 | 0.16366 | -0.390214 | 0.581644 | -1.781634 | -0.141997 | -0.385551 | 0.187641 | 0.318115 | -1.260527 | 0.959061 | 1.162692 | -1.060307 | 1.0 | 1.059598 | 0.436024 | 1.0 | -1.003898 | -1.015986 | -0.393543 | -0.961257 | 0.363695 | -0.822050 | -0.001103 | -2.250794 | -0.691478 | 0 | -0.818304 | -0.806965 | 0.565994 | 0.011756 | -1.279007 | 0.257044 | 1.677555 | 0.895222 | 0.315254 | 0.346469 | 1.328502 | -0.302571 | 1.006942 | -2.684447 | 0.715606 | 0.259994 | -1.313945 | 0.178727 | 0 | 0 | 0 | 0.0 | 0.0 | 1.404262 | 0 | 1 |
| 4 | 0.363289 | 0.855322 | -0.344900 | 0.16531 | 0 | 0.213221 | 0.721652 | 0.643383 | 0.16366 | -0.667220 | -0.242141 | 0.529803 | 2.134299 | 1.783846 | 0.187641 | 0.318115 | -1.260527 | 0.959061 | 1.162692 | -1.060307 | 1.0 | 1.059598 | 0.436024 | 1.0 | -1.003898 | 0.442896 | -0.334454 | 0.004065 | 0.363695 | -0.672314 | -0.001065 | 0.702482 | 0.304518 | 0 | 2.066875 | -1.695322 | 2.607537 | 0.850786 | -1.279007 | 0.257044 | 1.677555 | -0.256413 | 0.315254 | 0.346469 | 1.328502 | 1.302601 | -0.426188 | 0.364058 | 0.715606 | 0.259994 | -0.495993 | 0.634186 | 0 | 0 | 0 | 0.0 | 0.0 | -1.899196 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 880203 | 0.363289 | -1.189750 | 0.982328 | 0.16531 | 0 | 0.213221 | -1.414682 | 0.643383 | 0.16366 | 0.289092 | 0.006379 | -1.781634 | 2.134299 | 1.783846 | -5.369968 | -3.143332 | -1.260527 | -1.790996 | -1.203592 | -3.544700 | 1.0 | -1.304578 | 0.436024 | 1.0 | -1.003898 | 0.442896 | -0.390355 | -0.961257 | 0.363695 | -0.752456 | -0.001093 | -2.250794 | -0.790290 | 0 | -0.818304 | -0.806965 | -0.284649 | -0.603533 | -1.279007 | 0.257044 | 1.254422 | -0.943657 | 0.315254 | 0.346469 | -0.098121 | 1.302601 | 1.006942 | 0.364058 | 0.715606 | 0.259994 | -0.495993 | -0.369324 | 0 | 0 | 0 | 0.0 | 0.0 | -0.014470 | 1 | 1 |
| 880204 | 0.363289 | -1.218308 | 0.100414 | 0.16531 | 0 | 0.213221 | 0.721652 | 0.643383 | 0.16366 | 0.289092 | -0.324193 | 0.529803 | 2.134299 | 1.783846 | 0.187641 | 0.318115 | -1.260527 | -1.560201 | -1.111628 | -1.060307 | 1.0 | -1.196394 | 0.436024 | 1.0 | -1.003898 | -1.342150 | 1.088576 | 0.004065 | 0.363695 | -0.799377 | -0.001085 | -2.250794 | -0.220704 | 0 | -0.818304 | -0.806965 | 0.906251 | 0.710947 | -1.279007 | 0.257044 | 1.510713 | -0.911289 | 0.315254 | 0.346469 | -0.010080 | 1.302601 | -0.426188 | -2.684447 | 0.715606 | 0.259994 | 1.353931 | -0.257744 | 0 | 0 | 0 | 0.0 | 1.0 | -0.014470 | 1 | 1 |
| 880205 | 0.363289 | -1.189750 | -0.442759 | 0.16531 | 0 | 0.213221 | 0.721652 | 0.643383 | 0.16366 | 0.706949 | -0.342539 | 0.529803 | -0.141997 | -0.385551 | -5.302712 | 0.318115 | 0.771900 | -1.716878 | -1.171436 | 0.737851 | 1.0 | -1.225039 | -2.185498 | 1.0 | -1.003898 | -1.545868 | -0.394287 | 0.004065 | -2.749277 | -0.830658 | -0.001065 | 0.702482 | -0.451378 | 0 | -0.406135 | -0.806965 | 0.855213 | 0.011756 | -1.279007 | 0.257044 | 1.319771 | -0.888813 | 0.718437 | 0.690484 | -0.098121 | -0.302571 | 1.006942 | 0.364058 | 0.715606 | 0.259994 | 1.353931 | -0.366745 | 1 | 0 | 0 | 0.0 | 0.0 | -0.371381 | 1 | 1 |
| 880206 | 0.494191 | -1.189750 | -0.790611 | 0.16531 | 0 | 0.213221 | -1.414682 | 0.643383 | 0.16366 | 1.180778 | -0.336574 | 0.529803 | 0.008666 | -0.249204 | -5.369968 | -3.143332 | 0.771900 | -1.790996 | -1.203541 | -3.544700 | 1.0 | -1.304464 | 0.436024 | 1.0 | 0.729268 | -1.627140 | -0.393968 | -0.961257 | -2.749277 | -0.806894 | -0.001093 | -2.250794 | -0.463218 | 0 | -0.818304 | 0.812828 | -0.199584 | -0.603533 | 0.653900 | 0.257044 | -0.533143 | -0.920002 | 0.315254 | 0.346469 | -0.098121 | -0.698475 | 1.006942 | 0.364058 | 0.715606 | 0.259994 | -1.385781 | 0.012999 | 0 | 0 | 0 | 0.0 | 0.0 | 0.091632 | 1 | 1 |
| 880207 | 0.363289 | 0.855322 | -0.484755 | 0.16531 | 0 | 0.213221 | 0.721652 | 0.643383 | 0.16366 | 0.273902 | -0.029976 | 0.529803 | -0.509433 | -0.616827 | 0.187641 | -3.143332 | 0.771900 | -1.333953 | -1.182904 | 0.737851 | 1.0 | -1.105984 | 0.436024 | 1.0 | -1.872933 | -1.629253 | -0.288543 | 0.004065 | 0.363695 | -0.235473 | -0.001117 | -1.084632 | -1.067633 | 0 | -0.818304 | -0.806965 | -1.033214 | 1.783041 | -2.022015 | 0.257044 | -0.119487 | -0.509088 | -1.138097 | -1.223920 | -1.155319 | -0.792638 | -0.426188 | 0.364058 | -0.353922 | 0.259994 | 1.353931 | 0.245769 | 0 | 1 | 0 | 1.0 | 0.0 | 1.404262 | 0 | 1 |
880208 rows × 60 columns
**COMMENT**:
In the previous analysis we saw that we have some MULTI-COLLINEARITY PROBLEM, we tried to fix by removing the correlated features, but in addition we decided to perform also the Principal Component Analysis.
The PCA is a technique particularly useful in processing data where multi-colinearity exists. Moreover PCA can be used when the dimensions of the input features are high and can be also used for denoising and data compression. We know that is has some problems with the categorical features, but we implemented it, just to see how the different algorithm perform. Indeed at the end it did not perform better then the cleaned database. </font>
!pip install dash
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# we divide the dataset
X = df.loc[:, ~df.columns.isin(['HasDetections'])]
y = df['HasDetections']
X_train, X_test, y_train_pca, y_test_pca = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
# intialize pca and
pca = PCA()
# to look at which component should we use to go on with our analysis -> look at the variance explained by each variable
pca.fit(X_train)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)
px.area(
x=range(1, exp_var_cumul.shape[0] + 1),
y=exp_var_cumul,
labels={"x": "# Components", "y": "Explained Variance"}
)
Collecting dash
Downloading dash-2.0.0-py3-none-any.whl (7.3 MB)
|████████████████████████████████| 7.3 MB 5.4 MB/s
Collecting plotly>=5.0.0
Downloading plotly-5.4.0-py2.py3-none-any.whl (25.3 MB)
|████████████████████████████████| 25.3 MB 1.2 MB/s
Collecting dash-core-components==2.0.0
Downloading dash_core_components-2.0.0.tar.gz (3.4 kB)
Collecting dash-table==5.0.0
Downloading dash_table-5.0.0.tar.gz (3.4 kB)
Collecting dash-html-components==2.0.0
Downloading dash_html_components-2.0.0.tar.gz (3.8 kB)
Requirement already satisfied: Flask>=1.0.4 in /usr/local/lib/python3.7/dist-packages (from dash) (1.1.4)
Collecting flask-compress
Downloading Flask_Compress-1.10.1-py3-none-any.whl (7.9 kB)
Requirement already satisfied: Werkzeug<2.0,>=0.15 in /usr/local/lib/python3.7/dist-packages (from Flask>=1.0.4->dash) (1.0.1)
Requirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from Flask>=1.0.4->dash) (1.1.0)
Requirement already satisfied: Jinja2<3.0,>=2.10.1 in /usr/local/lib/python3.7/dist-packages (from Flask>=1.0.4->dash) (2.11.3)
Requirement already satisfied: click<8.0,>=5.1 in /usr/local/lib/python3.7/dist-packages (from Flask>=1.0.4->dash) (7.1.2)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->Flask>=1.0.4->dash) (2.0.1)
Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from plotly>=5.0.0->dash) (1.15.0)
Collecting tenacity>=6.2.0
Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Collecting brotli
Downloading Brotli-1.0.9-cp37-cp37m-manylinux1_x86_64.whl (357 kB)
|████████████████████████████████| 357 kB 65.5 MB/s
Building wheels for collected packages: dash-core-components, dash-html-components, dash-table
Building wheel for dash-core-components (setup.py) ... done
Created wheel for dash-core-components: filename=dash_core_components-2.0.0-py3-none-any.whl size=3821 sha256=49cc5eca7b1df6f176660f056f00ea8d8899795d374e63085c9bb2e5adf19157
Stored in directory: /root/.cache/pip/wheels/00/f9/c7/1a6437d794ed753ea9bc9079e761d4fc803a1f1f5d3697b9ec
Building wheel for dash-html-components (setup.py) ... done
Created wheel for dash-html-components: filename=dash_html_components-2.0.0-py3-none-any.whl size=4089 sha256=ec3960f9edbdc3d6831469285e5b782396a773a74ce81ef45fcb09bcb7f350df
Stored in directory: /root/.cache/pip/wheels/ec/6b/81/05aceabd8b27f724e2c96784016287cc1bfbc349ebfda451de
Building wheel for dash-table (setup.py) ... done
Created wheel for dash-table: filename=dash_table-5.0.0-py3-none-any.whl size=3911 sha256=6bc2920f33fb40ea2ab3e349cf992d21d4411d0d4175f7deaa6b4d4591da3125
Stored in directory: /root/.cache/pip/wheels/85/5d/4e/7c276b57992951dbe770bf5caad6448d0539c510663aefd2e2
Successfully built dash-core-components dash-html-components dash-table
Installing collected packages: tenacity, brotli, plotly, flask-compress, dash-table, dash-html-components, dash-core-components, dash
Attempting uninstall: plotly
Found existing installation: plotly 4.4.1
Uninstalling plotly-4.4.1:
Successfully uninstalled plotly-4.4.1
Successfully installed brotli-1.0.9 dash-2.0.0 dash-core-components-2.0.0 dash-html-components-2.0.0 dash-table-5.0.0 flask-compress-1.10.1 plotly-5.4.0 tenacity-8.0.1
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: UserWarning: The dash_core_components package is deprecated. Please replace `import dash_core_components as dcc` with `from dash import dcc` This is separate from the ipykernel package so we can avoid doing imports until /usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:4: UserWarning: The dash_html_components package is deprecated. Please replace `import dash_html_components as html` with `from dash import html` after removing the cwd from sys.path.
# We need only the calculated resulting components scores for the elements in our data set
pca = PCA(n_components=45)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
# Before all else, we’ll create a new data frame.
# It allows us to add in the values of the separate components to our segmentation data set.
# The components’ scores are stored in the ‘scores P C A’ variable. Let’s label them Component 1, 2 and 3. In addition, we also append the ‘K means P C A’ labels to the new data frame.
df_pca_plot = pd.concat([df.reset_index(drop = True), pd.DataFrame(X_train_pca)], axis= 1)
# we generate the name of the columns
numbers = range(1, 46)
comp_name = []
for i in numbers:
name = ('Component_%d' %i)
comp_name.append(name)
# add the name
df_pca_plot.columns.values[-45: ] = [j for j in comp_name]
df_pca_plot.dropna(inplace=True)
df_pca_plot
| EngineVersion | AppVersion | AvSigVersion | RtpStateBitfield | IsSxsPassiveMode | DefaultBrowsersIdentifier | AVProductStatesIdentifier | AVProductsInstalled | AVProductsEnabled | CountryIdentifier | CityIdentifier | OrganizationIdentifier | GeoNameIdentifier | LocaleEnglishNameIdentifier | Platform | Processor | OsSuite | OsPlatformSubRelease | OsBuildLab | SkuEdition | IsProtected | IeVerIdentifier | SmartScreen | Firewall | Census_MDC2FormFactor | Census_OEMNameIdentifier | Census_OEMModelIdentifier | Census_ProcessorCoreCount | Census_ProcessorManufacturerIdentifier | Census_ProcessorModelIdentifier | Census_PrimaryDiskTotalCapacity | Census_PrimaryDiskTypeName | Census_SystemVolumeTotalCapacity | Census_HasOpticalDiskDrive | Census_TotalPhysicalRAM | Census_ChassisTypeName | Census_InternalPrimaryDiagonalDisplaySizeInInches | Census_InternalPrimaryDisplayResolutionVertical | Census_PowerPlatformRoleName | Census_InternalBatteryType | ... | Component_6 | Component_7 | Component_8 | Component_9 | Component_10 | Component_11 | Component_12 | Component_13 | Component_14 | Component_15 | Component_16 | Component_17 | Component_18 | Component_19 | Component_20 | Component_21 | Component_22 | Component_23 | Component_24 | Component_25 | Component_26 | Component_27 | Component_28 | Component_29 | Component_30 | Component_31 | Component_32 | Component_33 | Component_34 | Component_35 | Component_36 | Component_37 | Component_38 | Component_39 | Component_40 | Component_41 | Component_42 | Component_43 | Component_44 | Component_45 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.363289 | -0.947542 | 2.687518 | 0.16531 | 0 | 0.213221 | -1.405365 | -1.403245 | 0.16366 | -1.378746 | -0.330496 | 0.529803 | -0.239250 | -0.298198 | 0.187641 | 0.318115 | 0.771900 | 0.959061 | 1.162692 | 0.737851 | 1.0 | 1.059598 | 0.436024 | 0.0 | 0.729268 | 0.385986 | -0.261549 | 0.004065 | -2.749277 | -0.788344 | -0.001010 | 0.702482 | 1.764481 | 0 | 0.418202 | 0.812828 | 0.106647 | 0.710947 | 0.653900 | 0.257044 | ... | -1.957477 | 0.629108 | 0.814590 | 0.725828 | -2.080214 | 0.621561 | 1.924248 | 1.892330 | 1.193359 | -2.047200 | -0.103726 | -1.090992 | -1.722345 | -0.526640 | 0.937734 | 0.107911 | -0.436765 | 1.093764 | 1.066728 | -1.079583 | -2.315074 | -0.800172 | -0.046247 | 1.114286 | 0.726463 | -1.151985 | 0.841432 | 0.155154 | 1.842144 | -1.122748 | -0.993066 | -0.252133 | -0.662632 | 0.867882 | 0.078541 | -0.216206 | -0.007832 | -0.251761 | -0.008042 | 0.232440 |
| 1 | -2.222032 | -1.300421 | -0.722467 | 0.16531 | 0 | 0.213221 | 0.721652 | 0.643383 | 0.16366 | -0.195324 | -0.331622 | 0.529803 | 2.134299 | 1.783846 | 0.187641 | 0.318115 | -1.260527 | -1.333953 | -1.193242 | -1.060307 | 1.0 | -1.249199 | 0.436024 | 1.0 | 0.729268 | 0.385986 | -0.373138 | 0.004065 | 0.363695 | -0.738270 | -0.001085 | 0.702482 | -1.000243 | 0 | -0.406135 | 0.812828 | -0.454777 | -0.603533 | 0.653900 | 0.257044 | ... | -0.562618 | -0.395382 | 0.340131 | -0.156550 | 0.420867 | -1.290567 | 1.582237 | -0.811965 | -0.332183 | -1.156589 | -0.237826 | 0.199534 | 0.220768 | 0.020921 | 0.747134 | -1.664373 | -0.384548 | -1.435330 | -0.434752 | -0.580889 | 0.478209 | -0.959038 | 0.161027 | 0.213857 | 0.420620 | 0.943526 | 0.616686 | -1.328859 | -0.525872 | 0.849405 | -1.274572 | -1.270506 | -0.657864 | -0.289306 | -0.498985 | 0.467136 | -0.822764 | 0.369468 | 0.193553 | -0.153698 |
| 2 | -2.351148 | -1.151868 | 2.235865 | 0.16531 | 0 | -4.695658 | -1.321083 | -1.403245 | 0.16366 | 1.202320 | -0.364150 | 0.529803 | -0.457982 | -0.601796 | 0.187641 | 0.318115 | 0.771900 | -0.058117 | -0.398772 | 0.737851 | 1.0 | -0.256914 | 0.436024 | 1.0 | 0.729268 | 0.631297 | 0.330406 | 0.004065 | 0.363695 | -0.739483 | -0.001010 | 0.702482 | 1.713965 | 0 | -0.406135 | 0.812828 | -0.199584 | 0.850786 | 0.653900 | 0.257044 | ... | -0.729734 | 2.942301 | -0.416398 | 1.231326 | -3.603010 | 1.246736 | 1.323255 | 1.276949 | 0.784888 | -2.335775 | -1.188696 | -0.160518 | -0.125785 | 0.238578 | 0.302600 | -0.695882 | 0.070475 | -0.704158 | -0.525731 | -0.781778 | -2.115703 | -0.276583 | -0.335755 | -0.872664 | 0.380599 | -0.749832 | 0.253777 | -0.632520 | 1.207754 | 0.326587 | -2.284840 | -0.000827 | -0.540569 | -0.669395 | 0.252484 | 0.320895 | -0.041174 | 0.190647 | -0.049171 | -0.222756 |
| 3 | -2.403825 | -1.196187 | -0.810817 | 0.16531 | 0 | -4.687355 | -1.265009 | -1.403245 | 0.16366 | -0.390214 | 0.581644 | -1.781634 | -0.141997 | -0.385551 | 0.187641 | 0.318115 | -1.260527 | 0.959061 | 1.162692 | -1.060307 | 1.0 | 1.059598 | 0.436024 | 1.0 | -1.003898 | -1.015986 | -0.393543 | -0.961257 | 0.363695 | -0.822050 | -0.001103 | -2.250794 | -0.691478 | 0 | -0.818304 | -0.806965 | 0.565994 | 0.011756 | -1.279007 | 0.257044 | ... | -1.311755 | 0.608063 | -1.321905 | 0.037838 | 0.062945 | -0.130921 | -0.748922 | 1.916918 | 0.187970 | 0.069937 | 0.491781 | 1.276415 | -0.623689 | 0.448443 | -0.738982 | 0.110995 | 1.687109 | -0.270675 | -0.344655 | -0.387595 | 0.583812 | 0.245619 | 0.683941 | 1.303579 | -0.499880 | 0.188339 | -0.786545 | 0.547929 | -0.019053 | 0.221749 | -0.044659 | -0.776954 | 0.267281 | 1.783582 | -0.458595 | -0.246317 | 0.174731 | 0.056575 | 0.154809 | -0.225970 |
| 4 | 0.363289 | 0.855322 | -0.344900 | 0.16531 | 0 | 0.213221 | 0.721652 | 0.643383 | 0.16366 | -0.667220 | -0.242141 | 0.529803 | 2.134299 | 1.783846 | 0.187641 | 0.318115 | -1.260527 | 0.959061 | 1.162692 | -1.060307 | 1.0 | 1.059598 | 0.436024 | 1.0 | -1.003898 | 0.442896 | -0.334454 | 0.004065 | 0.363695 | -0.672314 | -0.001065 | 0.702482 | 0.304518 | 0 | 2.066875 | -1.695322 | 2.607537 | 0.850786 | -1.279007 | 0.257044 | ... | 0.672804 | -0.855163 | 1.842754 | -0.023630 | -0.036024 | 0.357926 | 1.621571 | -1.270154 | -0.867470 | 0.722114 | -2.403146 | -0.914708 | -1.250136 | -0.260741 | 0.688596 | -0.364860 | 0.035049 | -0.546823 | 0.282372 | 0.491577 | -0.351360 | 0.297348 | 0.132486 | 0.128214 | -0.046435 | 0.339433 | 0.026101 | -0.417569 | -0.393173 | -0.907656 | -0.244848 | 0.331430 | 0.133146 | 0.218161 | -0.627086 | -0.162561 | -0.706735 | 0.045086 | -0.723417 | -0.156737 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 616140 | 0.363289 | 0.855322 | 1.773514 | 0.16531 | 0 | 0.213221 | 0.721652 | 0.643383 | 0.16366 | 0.339724 | -0.359873 | -1.781634 | 2.134299 | 1.783846 | 0.187641 | 0.318115 | -1.260527 | 0.959061 | 1.162692 | -1.060307 | 1.0 | 1.059598 | -2.556236 | 1.0 | -1.716302 | 0.442896 | -0.392055 | 0.004065 | 0.363695 | -0.440859 | -0.001093 | 0.702482 | -0.975567 | 0 | -0.406135 | -1.379888 | -0.590880 | -0.603533 | 0.653900 | 0.257044 | ... | 0.411481 | -0.922693 | 1.089123 | 0.084835 | -0.337558 | -0.430782 | 1.605622 | -0.930047 | -0.819493 | 0.502820 | -0.082539 | -1.346070 | 1.078163 | -0.315991 | -0.179756 | 0.054516 | -0.384233 | 0.507226 | -0.828700 | -0.694798 | 0.676404 | 1.150300 | 0.646041 | 0.172724 | -0.105200 | -0.059383 | 0.484835 | -0.531479 | -0.566294 | -0.709681 | 0.411516 | 0.049840 | 0.263218 | -0.582508 | 0.720052 | 0.771229 | -0.748459 | 0.064320 | -0.734480 | -0.121342 |
| 616141 | 0.494191 | 0.855322 | -0.560030 | 0.16531 | 0 | 0.213221 | 0.721652 | 0.643383 | 0.16366 | 0.041176 | 0.356310 | 0.529803 | -0.497291 | -0.608606 | 0.187641 | 0.318115 | 0.771900 | 0.959061 | 1.162692 | 0.737851 | 1.0 | 1.059598 | -2.185498 | 1.0 | -1.003898 | -0.966361 | 3.907854 | 0.004065 | 0.363695 | -0.774886 | -0.001010 | 0.702482 | 0.304352 | 0 | 0.418202 | -0.806965 | 0.821187 | 0.011756 | -1.279007 | 0.257044 | ... | -1.093223 | 1.485666 | -0.537841 | 0.181755 | -0.422383 | 0.698154 | 0.152651 | 0.036168 | -0.938917 | 0.750376 | -1.575185 | 0.110755 | -1.071505 | -0.100921 | 0.115446 | 0.007519 | -0.107868 | -0.778462 | 0.209646 | -0.191009 | -0.293346 | 0.290561 | 0.490719 | 0.308950 | -1.296284 | 0.684798 | 0.629969 | -0.694288 | 0.314649 | -0.759276 | -0.242510 | 0.019394 | 0.000630 | -1.111114 | -0.578695 | 0.343663 | 0.266241 | 0.095924 | 0.312667 | 0.006378 |
| 616142 | 0.494191 | 0.855322 | -0.540617 | 0.16531 | 0 | 0.213221 | 0.721652 | 0.643383 | 0.16366 | -0.592836 | -0.250245 | 0.529803 | -0.733487 | 1.783846 | 0.187641 | 0.318115 | -1.260527 | 0.959061 | 1.162692 | -1.060307 | 1.0 | 1.059598 | -2.556236 | 1.0 | 0.729268 | 0.209754 | -0.269519 | 0.004065 | 0.363695 | 0.516481 | -0.001065 | 0.702482 | -0.951761 | 0 | -0.406135 | 0.812828 | -0.199584 | -0.603533 | 0.653900 | 0.257044 | ... | -2.318469 | 0.341883 | -0.182784 | 0.144295 | -0.381123 | -1.505789 | 0.971182 | 2.217736 | -1.046138 | 2.186006 | -0.108159 | 2.225366 | 0.895461 | 0.255348 | -1.079228 | -0.657736 | -1.482439 | -1.348456 | -0.290493 | 0.764794 | -0.618738 | 0.270068 | -0.222696 | -0.331784 | -0.641229 | -0.322306 | 0.036170 | -0.164811 | 0.050068 | -0.239081 | -0.197095 | -0.081593 | -0.362389 | -0.126722 | 0.588381 | -0.266744 | 0.031482 | -0.010319 | -0.645969 | 0.133218 |
| 616143 | 0.494191 | -1.208620 | -0.345297 | 0.16531 | 0 | 0.213221 | -1.441353 | -1.403245 | 0.16366 | 0.496593 | -0.366851 | -1.781634 | -0.119057 | -0.347031 | 0.187641 | 0.318115 | -1.260527 | 0.959061 | 1.162692 | -1.060307 | 1.0 | 1.059598 | -2.536446 | 1.0 | -1.003898 | 0.442896 | -0.223183 | 0.004065 | 0.363695 | -0.325193 | -0.001092 | -1.084632 | -0.441216 | 0 | 0.418202 | -0.806965 | 1.212483 | 0.850786 | -1.279007 | 0.257044 | ... | 0.047547 | -1.302907 | -0.394477 | -0.166584 | 0.534297 | -0.123243 | 0.807279 | -0.318297 | 0.059526 | -0.144524 | -0.268196 | 0.409116 | -0.649791 | 0.623461 | -0.314856 | -0.074538 | -0.071399 | 0.336175 | -0.579340 | -0.608614 | 0.953821 | -0.036146 | 0.143437 | 0.054572 | -0.208300 | 0.437769 | 0.221507 | -0.498697 | 1.071868 | -1.411232 | -0.460213 | -0.142802 | 0.210200 | -2.120856 | -0.295727 | 0.408017 | 1.059846 | 0.201672 | -0.738446 | -1.491204 |
| 616144 | 0.494191 | 0.855322 | -0.453852 | 0.16531 | 0 | 0.213221 | -1.432949 | 0.643383 | 0.16366 | 0.152292 | -0.357734 | 0.529803 | -0.474793 | -0.593834 | 0.187641 | 0.318115 | 0.771900 | 0.959061 | 1.162692 | 0.737851 | 0.0 | 1.059598 | 0.436024 | 1.0 | -1.003898 | -1.015986 | -0.380684 | 0.004065 | 0.363695 | -0.616905 | -0.001093 | -1.084632 | -0.428440 | 0 | 2.066875 | -0.806965 | 1.161444 | 0.850786 | -1.279007 | 0.257044 | ... | 0.071473 | 0.303377 | 1.929815 | 0.282412 | -1.006003 | -1.034610 | -2.179172 | -0.621216 | -0.802599 | 0.084676 | 0.310662 | 1.849581 | 0.089153 | -1.235153 | -0.114180 | -0.466221 | 0.599264 | 0.681236 | 0.219596 | 0.183807 | 0.151702 | -0.405645 | -0.561075 | -0.209841 | 0.005464 | -0.634854 | -0.333728 | -1.020405 | 0.435072 | -0.341021 | -0.125067 | 0.468709 | 0.434718 | -0.031058 | -0.079841 | -0.088603 | 0.424583 | -0.035974 | 0.395109 | 0.119918 |
616145 rows × 105 columns
df_random_forest
**COMMENT**:
We avoid he problem caused by the implementation of the PCA iwth lots of categorical features, we decided to try also the RandomForest for features selection. The tree-based strategies used by random forests naturally ranks by how well they improve the purity of the node. This mean decrease in impurity over all trees (called gini impurity). Neverthenless also RandomForest carries with him some problem, beacuse it is very higly influenced by features with high variability. Indeed at the end it did not perform better then the cleaned database. </font>
#DEFINE XTEST YTEST ETC
X = df.loc[:, ~df.columns.isin(['HasDetections'])]
y = df['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
feature_names = [f"feature {i}" for i in range(X.shape[1])]
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)
RandomForestClassifier(random_state=42)
start_time = time.time() #here we calculate the importances in an array, printing the time passed during calculation
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0) #computes standard deviation
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
Elapsed time to compute the importances: 0.777 seconds
importances = forest.feature_importances_
print(importances.mean())
#print("{:10.7f}".format(np.median(importances)))
#print(importances.max())
zip_importances = zip(importances, X.columns)
dictionary_importances = dict(zip_importances)
#print(dictionary_importances)
greater = {k:v for (k,v) in dictionary_importances.items() if k > importances.mean()}
print(greater)
print(len(greater))
0.016949152542372885
{0.017210912241046754: 'EngineVersion', 0.020529363357150977: 'AppVersion', 0.062447103927776706: 'AvSigVersion', 0.026036918010088856: 'AVProductStatesIdentifier', 0.03937926512424226: 'CountryIdentifier', 0.05410188409471062: 'CityIdentifier', 0.03244270687169945: 'GeoNameIdentifier', 0.0301790299717791: 'LocaleEnglishNameIdentifier', 0.018767477302057987: 'OsBuildLab', 0.017054462606528004: 'IeVerIdentifier', 0.04428519924493529: 'SmartScreen', 0.027471181060297684: 'Census_OEMNameIdentifier', 0.046779300602724665: 'Census_OEMModelIdentifier', 0.04670775174301736: 'Census_ProcessorModelIdentifier', 0.022093210156214838: 'Census_PrimaryDiskTotalCapacity', 0.056145571453692436: 'Census_SystemVolumeTotalCapacity', 0.01896472948883158: 'Census_TotalPhysicalRAM', 0.03197029285362355: 'Census_InternalPrimaryDiagonalDisplaySizeInInches', 0.03613230248658253: 'Census_OSBuildRevision', 0.022794557763139926: 'Census_OSInstallTypeName', 0.023523979377846078: 'Census_OSInstallLanguageIdentifier', 0.021822672125697093: 'Census_FirmwareManufacturerIdentifier', 0.052328505015126585: 'Census_FirmwareVersionIdentifier', 0.022658661715082937: 'Wdft_RegionIdentifier'}
24
df_random_forest = df[list(greater.values())]
forest_importances = pd.Series(importances, X.columns)
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
ax.autoscale()
fig.set_size_inches(18.5, 10.5, forward=True)
plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=10, hspace=None)
fig.tight_layout(pad=3)
df_random_forest # df with feature that has importance > importance.mean
| EngineVersion | AppVersion | AvSigVersion | AVProductStatesIdentifier | CountryIdentifier | CityIdentifier | GeoNameIdentifier | LocaleEnglishNameIdentifier | OsBuildLab | IeVerIdentifier | SmartScreen | Census_OEMNameIdentifier | Census_OEMModelIdentifier | Census_ProcessorModelIdentifier | Census_PrimaryDiskTotalCapacity | Census_SystemVolumeTotalCapacity | Census_TotalPhysicalRAM | Census_InternalPrimaryDiagonalDisplaySizeInInches | Census_OSBuildRevision | Census_OSInstallTypeName | Census_OSInstallLanguageIdentifier | Census_FirmwareManufacturerIdentifier | Census_FirmwareVersionIdentifier | Wdft_RegionIdentifier | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.363289 | -0.947542 | 2.687518 | -1.405365 | -1.378746 | -0.330496 | -0.239250 | -0.298198 | 1.162692 | 1.059598 | 0.436024 | 0.385986 | -0.261549 | -0.788344 | -0.001010 | 1.764481 | 0.418202 | 0.106647 | 0.895222 | 1.328502 | 1.302601 | -0.279175 | 0.270382 | 0.168204 |
| 1 | -2.222032 | -1.300421 | -0.722467 | 0.721652 | -0.195324 | -0.331622 | 2.134299 | 1.783846 | -1.193242 | -1.249199 | 0.436024 | 0.385986 | -0.373138 | -0.738270 | -0.001085 | -1.000243 | -0.406135 | -0.454777 | -0.914559 | -0.098121 | 1.302601 | -1.025708 | -0.461681 | -0.014470 |
| 2 | -2.351148 | -1.151868 | 2.235865 | -1.321083 | 1.202320 | -0.364150 | -0.457982 | -0.601796 | -0.398772 | -0.256914 | 0.436024 | 0.631297 | 0.330406 | -0.739483 | -0.001010 | 1.713965 | -0.406135 | -0.199584 | 0.151251 | -1.155319 | 1.302601 | -0.279175 | 0.491196 | -1.576833 |
| 3 | -2.403825 | -1.196187 | -0.810817 | -1.265009 | -0.390214 | 0.581644 | -0.141997 | -0.385551 | 1.162692 | 1.059598 | 0.436024 | -1.015986 | -0.393543 | -0.822050 | -0.001103 | -0.691478 | -0.818304 | 0.565994 | 0.895222 | 1.328502 | -0.302571 | -1.313945 | 0.178727 | 1.404262 |
| 4 | 0.363289 | 0.855322 | -0.344900 | 0.721652 | -0.667220 | -0.242141 | 2.134299 | 1.783846 | 1.162692 | 1.059598 | 0.436024 | 0.442896 | -0.334454 | -0.672314 | -0.001065 | 0.304518 | 2.066875 | 2.607537 | -0.256413 | 1.328502 | 1.302601 | -0.495993 | 0.634186 | -1.899196 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 880203 | 0.363289 | -1.189750 | 0.982328 | -1.414682 | 0.289092 | 0.006379 | 2.134299 | 1.783846 | -1.203592 | -1.304578 | 0.436024 | 0.442896 | -0.390355 | -0.752456 | -0.001093 | -0.790290 | -0.818304 | -0.284649 | -0.943657 | -0.098121 | 1.302601 | -0.495993 | -0.369324 | -0.014470 |
| 880204 | 0.363289 | -1.218308 | 0.100414 | 0.721652 | 0.289092 | -0.324193 | 2.134299 | 1.783846 | -1.111628 | -1.196394 | 0.436024 | -1.342150 | 1.088576 | -0.799377 | -0.001085 | -0.220704 | -0.818304 | 0.906251 | -0.911289 | -0.010080 | 1.302601 | 1.353931 | -0.257744 | -0.014470 |
| 880205 | 0.363289 | -1.189750 | -0.442759 | 0.721652 | 0.706949 | -0.342539 | -0.141997 | -0.385551 | -1.171436 | -1.225039 | -2.185498 | -1.545868 | -0.394287 | -0.830658 | -0.001065 | -0.451378 | -0.406135 | 0.855213 | -0.888813 | -0.098121 | -0.302571 | 1.353931 | -0.366745 | -0.371381 |
| 880206 | 0.494191 | -1.189750 | -0.790611 | -1.414682 | 1.180778 | -0.336574 | 0.008666 | -0.249204 | -1.203541 | -1.304464 | 0.436024 | -1.627140 | -0.393968 | -0.806894 | -0.001093 | -0.463218 | -0.818304 | -0.199584 | -0.920002 | -0.098121 | -0.698475 | -1.385781 | 0.012999 | 0.091632 |
| 880207 | 0.363289 | 0.855322 | -0.484755 | 0.721652 | 0.273902 | -0.029976 | -0.509433 | -0.616827 | -1.182904 | -1.105984 | 0.436024 | -1.629253 | -0.288543 | -0.235473 | -0.001117 | -1.067633 | -0.818304 | -1.033214 | -0.509088 | -1.155319 | -0.792638 | 1.353931 | 0.245769 | 1.404262 |
880208 rows × 24 columns
Train various classifiers using subsets of the features available
**QUESTIONS**:
We already faced this problem --> for more details section 'Cleaning of the dataset' #3 #7 In brief if the percentage of Null values was more than 98% we decided to drop them because those variables did not provide a usefull information. While fot all the other we decided to do a prediction with a linear regression model trained on all other features for the numerical features, instead for categorical features we used the 'Mode'.
We can answer to this questions in two ways:
XGBoost was our final choice. Looking at FinalResult
In case of Supervised Learning we decided to use the confusion matrix and the following metrics that we can derive from that:
and in particular for our aim we saw that the most important metrics for us is Recall.
For more details look at the part RESULT int the section 'Analysis of the result obtained'
We decide to do our analysis with different dabatabase that were optimazed and cleaned in different ways, we use:
from sklearn.model_selection import cross_val_score
X = df.loc[:, ~df.columns.isin(['HasDetections'])]
y = df['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
lsvc = LinearSVC(verbose=0)
print(cross_val(lsvc, X_train, y_train))
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\svm\_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. C:\Users\lr999\anaconda3\lib\site-packages\sklearn\svm\_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
Accuracy: 0.617 (0.001)
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\svm\_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
lsvc.fit(X_train, y_train)
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\svm\_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
LinearSVC()
y_pred = lsvc.predict(X_test)
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 81939 | 50020 |
| 1 | 50320 | 81784 |
# create a table where keep track of the different result obtained
result['LinearSVM'] = ([accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test),
])
X = df.loc[:, ~df.columns.isin(['HasDetections'])]
y = df['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
import statsmodels.api as sm
from sklearn import metrics
logit_model=sm.Logit(y_train,X_train)
result_=logit_model.fit()
p_values = result_.summary2().tables[1]['P>|z|']
df_new = p_value_drop(df, p_values)
Optimization terminated successfully.
Current function value: 0.649479
Iterations 21
['AVProductStatesIdentifier', 'CityIdentifier', 'OrganizationIdentifier', 'OsSuite', 'Census_InternalPrimaryDisplayResolutionVertical', 'Census_PowerPlatformRoleName', 'Census_InternalBatteryType', 'Census_OSWUAutoUpdateOptionsName']
# re create the division on the new_df
X = df_new.loc[:, ~df_new.columns.isin(['HasDetections'])]
y = df_new['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
# fix the implementation with the new adjustments
logit_model_new =sm.Logit(y_train,X_train)
result_=logit_model_new.fit()
p_values = result_.summary2().tables[1]['P>|z|']
df_new = p_value_drop(df, p_values)
Optimization terminated successfully.
Current function value: 0.649488
Iterations 19
['Census_MDC2FormFactor']
# re create the division on the new_df
X = df_new.loc[:, ~df_new.columns.isin(['HasDetections'])]
y = df_new['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
# create model
logreg2 = LogisticRegression(random_state=42, solver='saga', max_iter=25000)
# evaluate model
print(cross_val(logreg2, X_train, y_train))
Accuracy: 0.617 (0.001)
logreg2.fit(X_train, y_train)
LogisticRegression(max_iter=25000, random_state=42, solver='saga')
from sklearn.metrics import roc_auc_score
#roc curve
y_proba = logreg2.predict_proba(X_test)
logit_roc_auc = roc_auc_score(y_test, logreg2.predict(X_test))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_proba[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' %logit_roc_auc)
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
# The optimal cut off would be where tpr is high and fpr is low
# tpr - (1-fpr) is zero or near to zero is the optimal cut off point
print("The AUC value is:", logit_roc_auc) # AUC area
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Threshold value is:", optimal_threshold)
The AUC value is: 0.6200062672800749 Threshold value is: 0.5000751413142316
y_pred = np.where(logreg2.predict_proba(X_test)[:,1] > optimal_threshold, 1, 0)
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 81589 | 50370 |
| 1 | 49962 | 82142 |
# create a table where keep track of the different result obtained
# create a table where keep track of the different result obtained
result['LogisticRegression'] = ([accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test),
])
# !!!!
X = df.loc[:, ~df.columns.isin(['HasDetections'])]
y = df['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
import statsmodels.api as sm
from sklearn import metrics
logit_model=sm.Logit(y_train,X_train)
result_=logit_model.fit_regularized(method='l1')
print(result_.summary2())
p_values = result_.summary2().tables[1]['P>|z|']
df_new = p_value_drop(df, p_values)
Optimization terminated successfully (Exit mode 0)
Current function value: 0.6497299728679805
Iterations: 152
Function evaluations: 152
Gradient evaluations: 152
Results: Logit
===================================================================================================
Model: Logit Pseudo R-squared: 0.063
Dependent Variable: HasDetections AIC: 800773.7483
Date: 2021-12-20 12:01 BIC: 801442.2913
No. Observations: 616145 Log-Likelihood: -4.0033e+05
Df Model: 58 LL-Null: -4.2708e+05
Df Residuals: 616086 LLR p-value: 0.0000
Converged: 1.0000 Scale: 1.0000
No. Iterations: 152.0000
---------------------------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------------------
EngineVersion 0.1424 0.0033 43.1669 0.0000 0.1360 0.1489
AppVersion 0.0215 0.0033 6.4537 0.0000 0.0150 0.0280
AvSigVersion 0.0284 0.0027 10.5543 0.0000 0.0231 0.0337
RtpStateBitfield 0.0183 0.0046 3.9769 0.0001 0.0093 0.0274
IsSxsPassiveMode -0.0949 0.0353 -2.6898 0.0071 -0.1641 -0.0258
DefaultBrowsersIdentifier -0.0363 0.0031 -11.7832 0.0000 -0.0423 -0.0303
AVProductStatesIdentifier 0.0613 0.0088 6.9927 0.0000 0.0441 0.0785
AVProductsInstalled 0.2708 0.0085 31.7381 0.0000 0.2541 0.2875
AVProductsEnabled 0.0307 0.0030 10.1758 0.0000 0.0248 0.0366
CountryIdentifier 0.0639 0.0030 21.2528 0.0000 0.0580 0.0698
CityIdentifier -0.0044 0.0027 -1.6221 0.1048 -0.0097 0.0009
OrganizationIdentifier -0.0009 0.0027 -0.3512 0.7254 -0.0062 0.0043
GeoNameIdentifier -0.0204 0.0042 -4.8023 0.0000 -0.0287 -0.0121
LocaleEnglishNameIdentifier 0.0436 0.0046 9.4087 0.0000 0.0345 0.0527
Platform -0.0132 0.0034 -3.8255 0.0001 -0.0200 -0.0064
Processor 0.0515 0.0039 13.2641 0.0000 0.0439 0.0591
OsSuite 0.0188 0.0074 2.5363 0.0112 0.0043 0.0333
OsPlatformSubRelease -0.0538 0.0083 -6.4690 0.0000 -0.0701 -0.0375
OsBuildLab 0.0615 0.0084 7.3607 0.0000 0.0452 0.0779
SkuEdition 0.0149 0.0074 1.9986 0.0456 0.0003 0.0295
IsProtected 0.1953 0.0165 11.8615 0.0000 0.1631 0.2276
IeVerIdentifier -0.0497 0.0098 -5.0715 0.0000 -0.0689 -0.0305
SmartScreen -0.4053 0.0029 -138.7892 0.0000 -0.4110 -0.3996
Firewall 0.0808 0.0166 4.8580 0.0000 0.0482 0.1134
Census_MDC2FormFactor -0.0157 0.0060 -2.6273 0.0086 -0.0273 -0.0040
Census_OEMNameIdentifier 0.0379 0.0032 11.9678 0.0000 0.0317 0.0441
Census_OEMModelIdentifier 0.0199 0.0029 6.7465 0.0000 0.0141 0.0257
Census_ProcessorCoreCount 0.0514 0.0035 14.5140 0.0000 0.0445 0.0583
Census_ProcessorManufacturerIdentifier 0.0049 0.0028 1.7370 0.0824 -0.0006 0.0105
Census_ProcessorModelIdentifier 0.0570 0.0033 17.2150 0.0000 0.0505 0.0635
Census_PrimaryDiskTotalCapacity 0.0103 0.1989 0.0519 0.9586 -0.3796 0.4002
Census_PrimaryDiskTypeName 0.0482 0.0033 14.3901 0.0000 0.0416 0.0548
Census_SystemVolumeTotalCapacity -0.0138 0.0033 -4.2508 0.0000 -0.0202 -0.0075
Census_HasOpticalDiskDrive 0.1239 0.0101 12.2760 0.0000 0.1041 0.1437
Census_TotalPhysicalRAM 0.0364 0.0039 9.3571 0.0000 0.0287 0.0440
Census_ChassisTypeName 0.0383 0.0035 10.8218 0.0000 0.0313 0.0452
Census_InternalPrimaryDiagonalDisplaySizeInInches 0.0117 0.0033 3.5111 0.0004 0.0052 0.0183
Census_InternalPrimaryDisplayResolutionVertical 0.0073 0.0034 2.1182 0.0342 0.0005 0.0140
Census_PowerPlatformRoleName 0.0060 0.0079 0.7601 0.4472 -0.0094 0.0214
Census_InternalBatteryType -0.0000 0.0027 -0.0092 0.9927 -0.0054 0.0053
Census_InternalBatteryNumberOfCharges 0.0348 0.0067 5.1721 0.0000 0.0216 0.0479
Census_OSBuildRevision 0.0467 0.0038 12.3758 0.0000 0.0393 0.0541
Census_OSEdition -0.0487 0.0090 -5.3861 0.0000 -0.0664 -0.0310
Census_OSSkuName 0.0317 0.0091 3.4658 0.0005 0.0138 0.0496
Census_OSInstallTypeName 0.0516 0.0029 17.6436 0.0000 0.0458 0.0573
Census_OSInstallLanguageIdentifier 0.0161 0.0035 4.6648 0.0000 0.0093 0.0229
Census_OSWUAutoUpdateOptionsName 0.0034 0.0034 1.0049 0.3149 -0.0033 0.0101
Census_GenuineStateName 0.0082 0.0029 2.8195 0.0048 0.0025 0.0139
Census_ActivationChannel -0.0175 0.0029 -6.0704 0.0000 -0.0231 -0.0118
Census_FlightRing 0.0199 0.0029 6.9645 0.0000 0.0143 0.0255
Census_FirmwareManufacturerIdentifier 0.0298 0.0030 10.0467 0.0000 0.0240 0.0356
Census_FirmwareVersionIdentifier -0.0232 0.0031 -7.5247 0.0000 -0.0293 -0.0172
Census_IsSecureBootEnabled 0.0676 0.0070 9.6857 0.0000 0.0539 0.0812
Census_IsTouchEnabled -0.1439 0.0118 -12.1448 0.0000 -0.1671 -0.1207
Census_IsPenCapable 0.0863 0.0172 5.0165 0.0000 0.0526 0.1201
Census_IsAlwaysOnAlwaysConnectedCapable -0.3499 0.0217 -16.1594 0.0000 -0.3923 -0.3074
Wdft_IsGamer 0.1656 0.0062 26.7143 0.0000 0.1535 0.1778
Wdft_RegionIdentifier 0.0289 0.0029 10.1129 0.0000 0.0233 0.0345
Replaced -0.3096 0.0200 -15.4963 0.0000 -0.3488 -0.2705
===================================================================================================
['CityIdentifier', 'OrganizationIdentifier', 'Census_ProcessorManufacturerIdentifier', 'Census_PrimaryDiskTotalCapacity', 'Census_PowerPlatformRoleName', 'Census_InternalBatteryType', 'Census_OSWUAutoUpdateOptionsName']
# re create the division on the new_df
X = df_new.loc[:, ~df_new.columns.isin(['HasDetections'])]
y = df_new['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
# create model
logreg2 = LogisticRegression(penalty= 'l1', solver = 'saga', random_state=42)
# evaluate model
print(cross_val(logreg2, X_train, y_train))
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
Accuracy: 0.617 (0.001)
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
logreg2.fit(X_train, y_train)
LogisticRegression(penalty='l1', random_state=42, solver='saga')
from sklearn.metrics import roc_auc_score
#curve roc
y_proba = logreg2.predict_proba(X_test)
logit_roc_auc = roc_auc_score(y_test, logreg2.predict(X_test))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_proba[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' %logit_roc_auc)
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
# The optimal cut off would be where tpr is high and fpr is low
# tpr - (1-fpr) is zero or near to zero is the optimal cut off point
print("The AUC value is:", logit_roc_auc) # AUC area
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Threshold value is:", optimal_threshold)
The AUC value is: 0.620010197739846 Threshold value is: 0.5019574491351927
y_pred = np.where(logreg2.predict_proba(X_test)[:,1] > optimal_threshold, 1, 0)
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 82345 | 49614 |
| 1 | 50694 | 81410 |
# create a table where keep track of the different result obtained
result['LogisticRegressionLasso'] = [accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test),
]
X = df.loc[:, ~df.columns.isin(['HasDetections'])]
y = df['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()
# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)
print(cross_val(clf,X_train,y_train))
#Predict the response for test dataset
y_pred = clf.predict(X_test)
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
Accuracy: 0.565 (0.001)
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 74351 | 57608 |
| 1 | 57158 | 74946 |
# create a table where keep track of the different result obtained
result['DecisionTree'] = [accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test)]
!pip install lightgbm
Requirement already satisfied: lightgbm in c:\users\lr999\anaconda3\lib\site-packages (3.3.1) Requirement already satisfied: numpy in c:\users\lr999\anaconda3\lib\site-packages (from lightgbm) (1.19.2) Requirement already satisfied: wheel in c:\users\lr999\anaconda3\lib\site-packages (from lightgbm) (0.35.1) Requirement already satisfied: scipy in c:\users\lr999\anaconda3\lib\site-packages (from lightgbm) (1.5.2) Requirement already satisfied: scikit-learn!=0.22.0 in c:\users\lr999\anaconda3\lib\site-packages (from lightgbm) (1.0.1) Requirement already satisfied: joblib>=0.11 in c:\users\lr999\anaconda3\lib\site-packages (from scikit-learn!=0.22.0->lightgbm) (0.17.0) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\lr999\anaconda3\lib\site-packages (from scikit-learn!=0.22.0->lightgbm) (2.1.0)
X = df.loc[:, ~df.columns.isin(['HasDetections'])]
y = df['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
import lightgbm
from lightgbm import LGBMClassifier #DEFINE PARAMETERS FOR LGBM MODEL AND IMPLEMENT IT
clf = lightgbm.LGBMClassifier()
clf = clf.fit(X_train, y_train)
cross_val(clf, X_train, y_train)
'Accuracy: 0.648 (0.001)'
y_pred=clf.predict(X_test)
#convert into binary values
for i in range(len(y_pred)):
if y_pred[i]>=.5: # setting threshold to .5
y_pred[i]=1
else:
y_pred[i]=0
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 87529 | 44430 |
| 1 | 48066 | 84038 |
# create a table where keep track of the different result obtained
result['LightGBM'] = [accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test)]
Define a grid for the randomized search, this will tell us about best hyperparams
#DEFINE XTEST YTEST ETC
X = df.loc[:, ~df.columns.isin(['HasDetections'])]
y = df['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
# Create a based model
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)
RandomForestClassifier(random_state=42)
y_pred = forest.predict(X_test)
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 88588 | 43371 |
| 1 | 51369 | 80735 |
result['RandomForest'] = [accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test)]
X = df.loc[:, ~df.columns.isin(['HasDetections'])]
y = df['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
gs_NB = GaussianNB()
gs_NB.fit(X_train, y_train)
GaussianNB()
y_pred = gs_NB.predict(X_test)
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 131713 | 246 |
| 1 | 131713 | 391 |
result['GaussianNB'] = [accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test)]
X = df.loc[:, ~df.columns.isin(['HasDetections'])]
y = df['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
xgb = XGBClassifier()
xgb = xgb.fit(X_train, y_train)
C:\Users\lr999\anaconda3\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
[12:08:33] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
y_pred_xg = xgb.predict(X_test)
# Confusion Matrix
pd.crosstab(y_test, y_pred_xg, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 87172 | 44787 |
| 1 | 46822 | 85282 |
result['XGBoost'] = [accuracy_score(y_pred_xg, y_test),
precision_score(y_pred_xg, y_test),
recall_score(y_pred_xg, y_test),
f1_score(y_pred_xg, y_test)]
from matplotlib.pyplot import figure
figure(figsize=(12, 10), dpi=80)
plt.bar(X.columns, xgb.feature_importances_)
plt.xticks(rotation=90)
plt.show()
from sklearn.model_selection import cross_val_score
lsvc = LinearSVC(verbose=0)
print(cross_val(lsvc, X_train_pca, y_train_pca))
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\svm\_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. C:\Users\lr999\anaconda3\lib\site-packages\sklearn\svm\_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
Accuracy: 0.617 (0.001)
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\svm\_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
lsvc.fit(X_train_pca, y_train_pca)
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\svm\_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
LinearSVC()
y_pred = lsvc.predict(X_test_pca)
# Confusion Matrix
pd.crosstab(y_test_pca, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 81423 | 50536 |
| 1 | 49959 | 82145 |
# create a table where keep track of the different result obtained
result['LinearSVM_pca'] = ([accuracy_score(y_pred, y_test_pca),
precision_score(y_pred, y_test_pca),
recall_score(y_pred, y_test_pca),
f1_score(y_pred, y_test_pca),
])
import statsmodels.api as sm
from sklearn import metrics
# create model
logreg2 = LogisticRegression(random_state=42, solver='saga', max_iter=25000)
# evaluate model
# print(cross_val(logreg2, X_train, y_train))
logreg2.fit(X_train_pca, y_train_pca)
LogisticRegression(max_iter=25000, random_state=42, solver='saga')
from sklearn.metrics import roc_auc_score
#curve roc
y_proba = logreg2.predict_proba(X_test_pca)
logit_roc_auc = roc_auc_score(y_test_pca, logreg2.predict(X_test_pca))
fpr, tpr, thresholds = metrics.roc_curve(y_test_pca, y_proba[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' %logit_roc_auc)
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
# The optimal cut off would be where tpr is high and fpr is low
# tpr - (1-fpr) is zero or near to zero is the optimal cut off point
print("The AUC value is:", logit_roc_auc) # AUC area
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Threshold value is:", optimal_threshold)
The AUC value is: 0.6192137307990142 Threshold value is: 0.5113629648183315
y_pred = np.where(logreg2.predict_proba(X_test_pca)[:,1] > optimal_threshold, 1, 0)
# Confusion Matrix
pd.crosstab(y_test_pca, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 85646 | 46313 |
| 1 | 54052 | 78052 |
# create a table where keep track of the different result obtained
result['LogisticRegression_pca'] = [accuracy_score(y_pred, y_test_pca),
precision_score(y_pred, y_test_pca),
recall_score(y_pred, y_test_pca),
f1_score(y_pred, y_test_pca),
]
import lightgbm
from lightgbm import LGBMClassifier #DEFINE PARAMETERS FOR LGBM MODEL AND IMPLEMENT IT
clf = lightgbm.LGBMClassifier()
clf = clf.fit(X_train_pca, y_train_pca)
cross_val(clf, X_train_pca, y_train_pca)
'Accuracy: 0.623 (0.001)'
y_pred=clf.predict(X_test_pca)
#convert into binary values
for i in range(len(y_pred)):
if y_pred[i]>=.5: # setting threshold to .5
y_pred[i]=1
else:
y_pred[i]=0
# Confusion Matrix
pd.crosstab(y_test_pca, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 84547 | 47412 |
| 1 | 51211 | 80893 |
# create a table where keep track of the different result obtained
result['LightGBM_pca'] = [accuracy_score(y_pred, y_test_pca),
precision_score(y_pred, y_test_pca),
recall_score(y_pred, y_test_pca),
f1_score(y_pred, y_test_pca)]
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()
# Train Decision Tree Classifer
clf = clf.fit(X_train_pca,y_train_pca)
print(cross_val(clf,X_train_pca,y_train_pca))
#Predict the response for test dataset
y_pred = clf.predict(X_test_pca)
# Confusion Matrix
pd.crosstab(y_test_pca, y_pred, rownames=['Real'], colnames=['Predicted'])
Accuracy: 0.547 (0.000)
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 72239 | 59720 |
| 1 | 59538 | 72566 |
# create a table where keep track of the different result obtained
result['DecisionTree_pca'] = [accuracy_score(y_pred, y_test_pca),
precision_score(y_pred, y_test_pca),
recall_score(y_pred, y_test_pca),
f1_score(y_pred, y_test_pca)]
# !!!!!!
import statsmodels.api as sm
from sklearn import metrics
# create model
logreg2 = LogisticRegression(penalty= 'l1', solver = 'saga', random_state=42)
# evaluate model
# print(cross_val(logreg2, X_train_pca, y_train))
logreg2.fit(X_train_pca, y_train_pca)
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
LogisticRegression(penalty='l1', random_state=42, solver='saga')
from sklearn.metrics import roc_auc_score
#roc curve
y_proba = logreg2.predict_proba(X_test_pca)
logit_roc_auc = roc_auc_score(y_test_pca, logreg2.predict(X_test_pca))
fpr, tpr, thresholds = metrics.roc_curve(y_test_pca, y_proba[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' %logit_roc_auc)
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
# The optimal cut off would be where tpr is high and fpr is low
# tpr - (1-fpr) is zero or near to zero is the optimal cut off point
print("The AUC value is:", logit_roc_auc) # AUC area
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Threshold value is:", optimal_threshold)
The AUC value is: 0.6192364484973893 Threshold value is: 0.5113585591452141
y_pred = np.where(logreg2.predict_proba(X_test_pca)[:,1] > optimal_threshold, 1, 0)
# Confusion Matrix
pd.crosstab(y_test_pca, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 85641 | 46318 |
| 1 | 54053 | 78051 |
y_pred = np.where(logreg2.predict_proba(X_test_pca)[:,1] > optimal_threshold, 1, 0)
# create a table where keep track of the different result obtained
result['LogisticRegressionLasso_pca'] = [accuracy_score(y_pred, y_test_pca),
precision_score(y_pred, y_test_pca),
recall_score(y_pred, y_test_pca),
f1_score(y_pred, y_test_pca),
]
result
| LinearSVM | LogisticRegression | LogisticRegressionLasso | DecisionTree | LightGBM | RandomForest | GaussianNB | XGBoost | LinearSVM_pca | LogisticRegression_pca | LightGBM_pca | DecisionTree_pca | LogisticRegressionLasso_pca | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 0.620015 | 0.620045 | 0.620136 | 0.565384 | 0.649720 | 0.641222 | 0.500275 | 0.653079 | 0.619428 | 0.619920 | 0.626517 | 0.548373 | 0.619898 |
| Precision | 0.619088 | 0.621798 | 0.616257 | 0.567326 | 0.636150 | 0.611147 | 0.002960 | 0.645567 | 0.621821 | 0.590838 | 0.612343 | 0.549310 | 0.590830 |
| Recall | 0.620497 | 0.619883 | 0.621337 | 0.565400 | 0.654155 | 0.650533 | 0.613815 | 0.655667 | 0.619117 | 0.627604 | 0.630474 | 0.548554 | 0.627576 |
| F1 | 0.619792 | 0.620839 | 0.618786 | 0.566361 | 0.645027 | 0.630225 | 0.005891 | 0.650578 | 0.620466 | 0.608666 | 0.621277 | 0.548932 | 0.608649 |
Define a grid for the randomized search, this will tell us about best hyperparams
# Create a based model
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train_pca, y_train_pca)
RandomForestClassifier(random_state=42)
y_pred = forest.predict(X_test_pca)
# Confusion Matrix
pd.crosstab(y_test_pca, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 85554 | 46405 |
| 1 | 54313 | 77791 |
result['RandomForest_pca'] = [accuracy_score(y_pred, y_test_pca),
precision_score(y_pred, y_test_pca),
recall_score(y_pred, y_test_pca),
f1_score(y_pred, y_test_pca)]
gs_NB = GaussianNB()
gs_NB.fit(X_train_pca, y_train_pca)
GaussianNB()
y_pred = gs_NB.predict(X_test_pca)
# Confusion Matrix
pd.crosstab(y_test_pca, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 118282 | 13677 |
| 1 | 105589 | 26515 |
result['GaussianNB_pca'] = [accuracy_score(y_pred, y_test_pca),
precision_score(y_pred, y_test_pca),
recall_score(y_pred, y_test_pca),
f1_score(y_pred, y_test_pca)]
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_pca, y_train_pca)
C:\Users\lr999\anaconda3\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
[12:38:40] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=12,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
y_pred = xgb.predict(X_test_pca)
# Confusion Matrix
pd.crosstab(y_test_pca, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 83845 | 48114 |
| 1 | 50622 | 81482 |
result['XGBoost_pca'] = [accuracy_score(y_pred, y_test_pca),
precision_score(y_pred, y_test_pca),
recall_score(y_pred, y_test_pca),
f1_score(y_pred, y_test_pca)]
df_random_forest['HasDetections'] = df['HasDetections']
<ipython-input-139-13fde8050c81>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X = df_random_forest.loc[:, ~df_random_forest.columns.isin(['HasDetections'])]
y = df_random_forest['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
import statsmodels.api as sm
from sklearn import metrics
logit_model=sm.Logit(y_train,X_train)
result_=logit_model.fit()
print(result_.summary2())
p_values = result_.summary2().tables[1]['P>|z|']
df_new = p_value_drop(df, p_values)
Optimization terminated successfully.
Current function value: 0.654570
Iterations 10
Results: Logit
===================================================================================================
Model: Logit Pseudo R-squared: 0.056
Dependent Variable: HasDetections AIC: 806668.4767
Date: 2021-12-20 13:22 BIC: 806940.4264
No. Observations: 616145 Log-Likelihood: -4.0331e+05
Df Model: 23 LL-Null: -4.2708e+05
Df Residuals: 616121 LLR p-value: 0.0000
Converged: 1.0000 Scale: 1.0000
No. Iterations: 10.0000
---------------------------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------------------
EngineVersion 0.1275 0.0030 41.9426 0.0000 0.1216 0.1335
AppVersion 0.0175 0.0032 5.5022 0.0000 0.0113 0.0237
AvSigVersion 0.0257 0.0027 9.6459 0.0000 0.0205 0.0309
AVProductStatesIdentifier 0.3054 0.0028 109.1511 0.0000 0.2999 0.3109
CountryIdentifier 0.0716 0.0028 25.3541 0.0000 0.0661 0.0771
CityIdentifier -0.0022 0.0027 -0.8162 0.4144 -0.0075 0.0031
GeoNameIdentifier -0.0246 0.0042 -5.8800 0.0000 -0.0328 -0.0164
LocaleEnglishNameIdentifier 0.0434 0.0046 9.4581 0.0000 0.0344 0.0524
OsBuildLab 0.1645 0.0063 25.9859 0.0000 0.1521 0.1769
IeVerIdentifier -0.1921 0.0066 -28.9017 0.0000 -0.2052 -0.1791
SmartScreen -0.4075 0.0029 -141.7801 0.0000 -0.4131 -0.4019
Census_OEMNameIdentifier 0.0696 0.0030 23.5062 0.0000 0.0638 0.0754
Census_OEMModelIdentifier 0.0227 0.0028 8.0028 0.0000 0.0172 0.0283
Census_ProcessorModelIdentifier 0.0625 0.0029 21.4856 0.0000 0.0568 0.0682
Census_PrimaryDiskTotalCapacity 0.0064 0.0308 0.2074 0.8357 -0.0541 0.0668
Census_SystemVolumeTotalCapacity 0.0329 0.0028 11.6409 0.0000 0.0274 0.0385
Census_TotalPhysicalRAM 0.0730 0.0032 23.0928 0.0000 0.0668 0.0791
Census_InternalPrimaryDiagonalDisplaySizeInInches 0.0404 0.0029 13.9255 0.0000 0.0347 0.0461
Census_OSBuildRevision 0.0576 0.0036 16.1825 0.0000 0.0506 0.0646
Census_OSInstallTypeName 0.0467 0.0028 16.5479 0.0000 0.0411 0.0522
Census_OSInstallLanguageIdentifier 0.0200 0.0034 5.8844 0.0000 0.0134 0.0267
Census_FirmwareManufacturerIdentifier 0.0441 0.0029 15.2984 0.0000 0.0385 0.0498
Census_FirmwareVersionIdentifier -0.0297 0.0030 -9.7622 0.0000 -0.0356 -0.0237
Wdft_RegionIdentifier 0.0307 0.0028 11.1409 0.0000 0.0253 0.0362
===================================================================================================
['CityIdentifier', 'Census_PrimaryDiskTotalCapacity']
# re create the division on the new_df
X = df_new.loc[:, ~df_new.columns.isin(['HasDetections'])]
y = df_new['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
# create model
logreg2 = LogisticRegression(random_state=42, solver='saga', max_iter=25000)
# evaluate model
print(cross_val(logreg2, X_train, y_train))
Accuracy: 0.617 (0.001)
logreg2.fit(X_train, y_train)
LogisticRegression(max_iter=25000, random_state=42, solver='saga')
from sklearn.metrics import roc_auc_score
#roc curve
y_proba = logreg2.predict_proba(X_test)
logit_roc_auc = roc_auc_score(y_test, logreg2.predict(X_test))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_proba[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' %logit_roc_auc)
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
# The optimal cut off would be where tpr is high and fpr is low
# tpr - (1-fpr) is zero or near to zero is the optimal cut off point
print("The AUC value is:", logit_roc_auc) # AUC area
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Threshold value is:", optimal_threshold)
The AUC value is: 0.6199647498662652 Threshold value is: 0.5057166875582852
y_pred = np.where(logreg2.predict_proba(X_test)[:,1] > optimal_threshold, 1, 0)
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 83792 | 48167 |
| 1 | 52143 | 79961 |
# create a table where keep track of the different result obtained
result['LogisticRegressionLasso_RandomForest'] = [accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test),
]
result
| LinearSVM | LogisticRegression | LogisticRegressionLasso | DecisionTree | LightGBM | RandomForest | GaussianNB | XGBoost | LinearSVM_pca | LogisticRegression_pca | LightGBM_pca | DecisionTree_pca | LogisticRegressionLasso_pca | RandomForest_pca | GaussianNB_pca | XGBoost_pca | LogisticRegressionLasso_RandomForest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 0.620015 | 0.620045 | 0.620136 | 0.565384 | 0.649720 | 0.641222 | 0.500275 | 0.653079 | 0.619428 | 0.619920 | 0.626517 | 0.548373 | 0.619898 | 0.618583 | 0.548343 | 0.626089 | 0.620129 |
| Precision | 0.619088 | 0.621798 | 0.616257 | 0.567326 | 0.636150 | 0.611147 | 0.002960 | 0.645567 | 0.621821 | 0.590838 | 0.612343 | 0.549310 | 0.590830 | 0.588862 | 0.200713 | 0.616802 | 0.605288 |
| Recall | 0.620497 | 0.619883 | 0.621337 | 0.565400 | 0.654155 | 0.650533 | 0.613815 | 0.655667 | 0.619117 | 0.627604 | 0.630474 | 0.548554 | 0.627576 | 0.626357 | 0.659708 | 0.628739 | 0.624071 |
| F1 | 0.619792 | 0.620839 | 0.618786 | 0.566361 | 0.645027 | 0.630225 | 0.005891 | 0.650578 | 0.620466 | 0.608666 | 0.621277 | 0.548932 | 0.608649 | 0.607031 | 0.307784 | 0.622713 | 0.614536 |
X = df_random_forest.loc[:, ~df_random_forest.columns.isin(['HasDetections'])]
y = df_random_forest['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
import statsmodels.api as sm
from sklearn import metrics
logit_model=sm.Logit(y_train,X_train)
result_=logit_model.fit_regularized(method='l1')
print(result_.summary2())
p_values = result_.summary2().tables[1]['P>|z|']
df_new = p_value_drop(df, p_values)
Optimization terminated successfully (Exit mode 0)
Current function value: 0.6545703367520499
Iterations: 37
Function evaluations: 38
Gradient evaluations: 37
Results: Logit
===================================================================================================
Model: Logit Pseudo R-squared: 0.056
Dependent Variable: HasDetections AIC: 806668.4803
Date: 2021-12-20 13:25 BIC: 806940.4300
No. Observations: 616145 Log-Likelihood: -4.0331e+05
Df Model: 23 LL-Null: -4.2708e+05
Df Residuals: 616121 LLR p-value: 0.0000
Converged: 1.0000 Scale: 1.0000
No. Iterations: 37.0000
---------------------------------------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------------------
EngineVersion 0.1275 0.0030 41.9442 0.0000 0.1216 0.1335
AppVersion 0.0175 0.0032 5.5012 0.0000 0.0113 0.0237
AvSigVersion 0.0257 0.0027 9.6446 0.0000 0.0205 0.0309
AVProductStatesIdentifier 0.3054 0.0028 109.1514 0.0000 0.2999 0.3109
CountryIdentifier 0.0716 0.0028 25.3555 0.0000 0.0661 0.0771
CityIdentifier -0.0022 0.0027 -0.8172 0.4138 -0.0075 0.0031
GeoNameIdentifier -0.0246 0.0042 -5.8797 0.0000 -0.0328 -0.0164
LocaleEnglishNameIdentifier 0.0434 0.0046 9.4589 0.0000 0.0344 0.0524
OsBuildLab 0.1645 0.0063 25.9860 0.0000 0.1521 0.1769
IeVerIdentifier -0.1921 0.0066 -28.9018 0.0000 -0.2052 -0.1791
SmartScreen -0.4075 0.0029 -141.7799 0.0000 -0.4131 -0.4019
Census_OEMNameIdentifier 0.0696 0.0030 23.5042 0.0000 0.0638 0.0754
Census_OEMModelIdentifier 0.0227 0.0028 8.0034 0.0000 0.0172 0.0283
Census_ProcessorModelIdentifier 0.0625 0.0029 21.4843 0.0000 0.0568 0.0682
Census_PrimaryDiskTotalCapacity 0.0050 0.0158 0.3145 0.7531 -0.0260 0.0360
Census_SystemVolumeTotalCapacity 0.0329 0.0028 11.6412 0.0000 0.0274 0.0385
Census_TotalPhysicalRAM 0.0730 0.0032 23.0944 0.0000 0.0668 0.0792
Census_InternalPrimaryDiagonalDisplaySizeInInches 0.0404 0.0029 13.9262 0.0000 0.0347 0.0461
Census_OSBuildRevision 0.0576 0.0036 16.1830 0.0000 0.0506 0.0646
Census_OSInstallTypeName 0.0467 0.0028 16.5469 0.0000 0.0411 0.0522
Census_OSInstallLanguageIdentifier 0.0200 0.0034 5.8833 0.0000 0.0134 0.0267
Census_FirmwareManufacturerIdentifier 0.0441 0.0029 15.2970 0.0000 0.0385 0.0498
Census_FirmwareVersionIdentifier -0.0297 0.0030 -9.7623 0.0000 -0.0356 -0.0237
Wdft_RegionIdentifier 0.0307 0.0028 11.1403 0.0000 0.0253 0.0361
===================================================================================================
['CityIdentifier', 'Census_PrimaryDiskTotalCapacity']
# re create the division on the new_df
X = df_new.loc[:, ~df_new.columns.isin(['HasDetections'])]
y = df_new['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
# create model
logreg2 = LogisticRegression(penalty= 'l1', solver = 'saga', random_state=42)
# evaluate model
print(cross_val(logreg2, X_train, y_train))
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
Accuracy: 0.617 (0.001)
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
logreg2.fit(X_train, y_train)
LogisticRegression(penalty='l1', random_state=42, solver='saga')
from sklearn.metrics import roc_auc_score
#roc curve
y_proba = logreg2.predict_proba(X_test)
logit_roc_auc = roc_auc_score(y_test, logreg2.predict(X_test))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_proba[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' %logit_roc_auc)
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
# The optimal cut off would be where tpr is high and fpr is low
# tpr - (1-fpr) is zero or near to zero is the optimal cut off point
print("The AUC value is:", logit_roc_auc) # AUC area
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Threshold value is:", optimal_threshold)
The AUC value is: 0.6199495978024425 Threshold value is: 0.5057631055970708
y_pred = np.where(logreg2.predict_proba(X_test)[:,1] > optimal_threshold, 1, 0)
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 83811 | 48148 |
| 1 | 52162 | 79942 |
# create a table where keep track of the different result obtained
result['LogisticRegressionLasso_RandomForest'] = [accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test),
]
result
| LinearSVM | LogisticRegression | LogisticRegressionLasso | DecisionTree | LightGBM | RandomForest | GaussianNB | XGBoost | LinearSVM_pca | LogisticRegression_pca | LightGBM_pca | DecisionTree_pca | LogisticRegressionLasso_pca | RandomForest_pca | GaussianNB_pca | XGBoost_pca | LogisticRegressionLasso_RandomForest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Accuracy | 0.620015 | 0.620045 | 0.620136 | 0.565384 | 0.649720 | 0.641222 | 0.500275 | 0.653079 | 0.619428 | 0.619920 | 0.626517 | 0.548373 | 0.619898 | 0.618583 | 0.548343 | 0.626089 | 0.620129 |
| Precision | 0.619088 | 0.621798 | 0.616257 | 0.567326 | 0.636150 | 0.611147 | 0.002960 | 0.645567 | 0.621821 | 0.590838 | 0.612343 | 0.549310 | 0.590830 | 0.588862 | 0.200713 | 0.616802 | 0.605144 |
| Recall | 0.620497 | 0.619883 | 0.621337 | 0.565400 | 0.654155 | 0.650533 | 0.613815 | 0.655667 | 0.619117 | 0.627604 | 0.630474 | 0.548554 | 0.627576 | 0.626357 | 0.659708 | 0.628739 | 0.624108 |
| F1 | 0.619792 | 0.620839 | 0.618786 | 0.566361 | 0.645027 | 0.630225 | 0.005891 | 0.650578 | 0.620466 | 0.608666 | 0.621277 | 0.548932 | 0.608649 | 0.607031 | 0.307784 | 0.622713 | 0.614480 |
from sklearn.model_selection import cross_val_score
X = df_random_forest.loc[:, ~df_random_forest.columns.isin(['HasDetections'])]
y = df_random_forest['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
lsvc = LinearSVC(verbose=0)
print(cross_val(lsvc, X_train, y_train))
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\svm\_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. C:\Users\lr999\anaconda3\lib\site-packages\sklearn\svm\_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
Accuracy: 0.612 (0.001)
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\svm\_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
lsvc.fit(X_train, y_train)
C:\Users\lr999\anaconda3\lib\site-packages\sklearn\svm\_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
LinearSVC()
y_pred = lsvc.predict(X_test)
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 80824 | 51135 |
| 1 | 50721 | 81383 |
# create a table where keep track of the different result obtained
result['LinearSVM_RandomForest'] = ([accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test),
])
X = df_random_forest.loc[:, ~df_random_forest.columns.isin(['HasDetections'])]
y = df_random_forest['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()
# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)
print(cross_val(clf,X_train,y_train))
#Predict the response for test dataset
y_pred = clf.predict(X_test)
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
Accuracy: 0.563 (0.000)
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 74089 | 57870 |
| 1 | 57106 | 74998 |
# create a table where keep track of the different result obtained
result['DecisionTree_RandomForest'] = [accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test)]
X = df_random_forest.loc[:, ~df_random_forest.columns.isin(['HasDetections'])]
y = df_random_forest['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
import lightgbm
from lightgbm import LGBMClassifier #DEFINE PARAMETERS FOR LGBM MODEL AND IMPLEMENT IT
clf = lightgbm.LGBMClassifier()
clf = clf.fit(X_train, y_train)
cross_val(clf, X_train, y_train)
'Accuracy: 0.644 (0.001)'
y_pred=clf.predict(X_test)
#convert into binary values
for i in range(len(y_pred)):
if y_pred[i]>=.5: # setting threshold to .5
y_pred[i]=1
else:
y_pred[i]=0
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 86648 | 45311 |
| 1 | 48363 | 83741 |
# create a table where keep track of the different result obtained
result['LightGBM_RandomForest'] = [accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test)]
X = df_random_forest.loc[:, ~df_random_forest.columns.isin(['HasDetections'])]
y = df_random_forest['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
gs_NB = GaussianNB()
gs_NB.fit(X_train, y_train)
GaussianNB()
y_pred = gs_NB.predict(X_test)
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 131825 | 134 |
| 1 | 131955 | 149 |
result['GaussianNB_RandomForest'] = [accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test)]
X = df_random_forest.loc[:, ~df_random_forest.columns.isin(['HasDetections'])]
y = df_random_forest['HasDetections']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)
C:\Users\lr999\anaconda3\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
[13:39:48] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=12,
num_parallel_tree=1, predictor='auto', random_state=42,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
y_pred = xgb.predict(X_test)
# Confusion Matrix
pd.crosstab(y_test, y_pred, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 86026 | 45933 |
| 1 | 46911 | 85193 |
result['XGBoost_RandomForest'] = [accuracy_score(y_pred, y_test),
precision_score(y_pred, y_test),
recall_score(y_pred, y_test),
f1_score(y_pred, y_test)]
result.T
# to see which algorithms have the best performances
result.T.style.apply(highlight_max, props='color:white; background-color:purple;', axis=0)
# axis=0 so in this way we find the higher score among all the algorithms
| Accuracy | Precision | Recall | F1 | |
|---|---|---|---|---|
| LinearSVM | 0.620015 | 0.619088 | 0.620497 | 0.619792 |
| LogisticRegression | 0.620045 | 0.621798 | 0.619883 | 0.620839 |
| LogisticRegressionLasso | 0.620136 | 0.616257 | 0.621337 | 0.618786 |
| DecisionTree | 0.565384 | 0.567326 | 0.565400 | 0.566361 |
| LightGBM | 0.649720 | 0.636150 | 0.654155 | 0.645027 |
| RandomForest | 0.641222 | 0.611147 | 0.650533 | 0.630225 |
| GaussianNB | 0.500275 | 0.002960 | 0.613815 | 0.005891 |
| XGBoost | 0.653079 | 0.645567 | 0.655667 | 0.650578 |
| LinearSVM_pca | 0.619428 | 0.621821 | 0.619117 | 0.620466 |
| LogisticRegression_pca | 0.619920 | 0.590838 | 0.627604 | 0.608666 |
| LightGBM_pca | 0.626517 | 0.612343 | 0.630474 | 0.621277 |
| DecisionTree_pca | 0.548373 | 0.549310 | 0.548554 | 0.548932 |
| LogisticRegressionLasso_pca | 0.619898 | 0.590830 | 0.627576 | 0.608649 |
| RandomForest_pca | 0.618583 | 0.588862 | 0.626357 | 0.607031 |
| GaussianNB_pca | 0.548343 | 0.200713 | 0.659708 | 0.307784 |
| XGBoost_pca | 0.626089 | 0.616802 | 0.628739 | 0.622713 |
| LogisticRegressionLasso_RandomForest | 0.620129 | 0.605144 | 0.624108 | 0.614480 |
| LinearSVM_RandomForest | 0.614274 | 0.616053 | 0.614128 | 0.615089 |
| DecisionTree_RandomForest | 0.564589 | 0.567719 | 0.564455 | 0.566082 |
| LightGBM_RandomForest | 0.645259 | 0.633902 | 0.648893 | 0.641310 |
| GaussianNB_RandomForest | 0.499782 | 0.001128 | 0.526502 | 0.002251 |
| XGBoost_RandomForest | 0.648402 | 0.644893 | 0.649703 | 0.647289 |
# That is the Confusion Matrix of XGBoost (= our best algorithm)
pd.crosstab(y_test, y_pred_xg, rownames=['Real'], colnames=['Predicted'])
| Predicted | 0 | 1 |
|---|---|---|
| Real | ||
| 0 | 87172 | 44787 |
| 1 | 46822 | 85282 |
It is a performance measurement for machine learning classification problem where output can be two or more classes. It is a table with 4 different combinations of predicted and actual values.
It is extremely useful for measuring Recall, Precision, Specificity, Accuracy, and most importantly AUC-ROC curves. In particular the matrix show the number of:
According to the aim of your analysis you may be more interested in a algorithm with higher precision or higher recall, usually you always want high accurancy, so you need then to understand if it is more risky to have more false negative or false positive.
[There are errors that are systematic and some that are random. If we have only random error, good precision indicates good accuracy. Therefore, the presence systematic errors prevent us from making the conclusion that good precision means good accuracy.]
That is a brief explanation of all the metrics that we used to evaluate the different algorithms:
-Accuracy : is a measure of how close the observed value (measured value) is to the true value.
-Recall: according to the target value it tells us how many we correctly identified as having Detections.
-Precision: is how close two or more measurements are to each other.
According to our analysis we found out that we prefer to have less FalseNegative as possible, because we would like to avoid an unaccurate prediction that will let a possible targeted device exposed to an infection, so we want to have the higest Recall possible --> XGBoost is our perfect algorithm.
Because if we miss the prediction about the possible infection the machine will receive a wrong information from a trusted element (our model) that then will expose a huge risk to be infected</font>
**COMMENT**:
After analyzing the plot iportance of the best algorithm (XGBoost) we found 5 importance features:
on these columns we are going to perform more test analysis . In particular, we focus on the behaviour of these features in the group of HasDetections</font>
df_infected = df[df['HasDetections'] == 1]
df_not_infected= df[df['HasDetections'] == 0]
# function to plot the percentage on the pie_chart
def label_function(val):
if val / 100 * len(df) > 20:
return f'{val / 100 * len(df):.0f}\n{val:.0f}%'
# to get back the right name of the original classes
column ='SmartScreen'
df_infected[column] = df_infected[column].map(retr_SmartScreen.set_index(column)['real'])
df_not_infected[column] = df_not_infected[column].map(retr_SmartScreen.set_index(column)['real'])
column ='Processor'
df_infected[column] = df_infected[column].map(retr_processor.set_index(column)['real'])
df_not_infected[column] = df_not_infected[column].map(retr_processor.set_index(column)['real'])
column ='AVProductsInstalled'
df_infected[column] = df_infected[column].map(retr_AVPoductInstalled.set_index(column)['real'])
df_not_infected[column] = df_not_infected[column].map(retr_AVPoductInstalled.set_index(column)['real'])
<ipython-input-176-627486e35dae>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-176-627486e35dae>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-176-627486e35dae>:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-176-627486e35dae>:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-176-627486e35dae>:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-176-627486e35dae>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
#plot the pie chart
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(180, 180))
w1 = df_infected.Processor.value_counts()
w2 = df_not_infected.Processor.value_counts()
w1.plot(kind='pie', autopct=label_function, textprops={'fontsize': 120}, ax=ax1, colors=['lime','orange', 'pink'])
w2.plot(kind='pie', autopct=label_function, textprops={'fontsize': 120}, ax=ax2, colors=['lime','orange', 'pink'])
ax1.set_xlabel('Per Infected \nProcessor', size=150)
ax2.set_xlabel('Per NotInfected \nProcessor', size=150)
Text(0.5, 0, 'Per NotInfected \nProcessor')
**COMMENT**:
As we can see from this pie chart the processor 'arm64' is practically unused in both the groups, beacase it is extremelly rare to finf a machine with this kinf of processor. On the other side there is just a slightly difference of 4% in x64-x86.
Using a processor x64 increase the chance to be infected. Having a x64 or a x86 processor for a normal user does not change anything, while for experts and the amateurs it increase a lot the performance of the device. Since that we can think that who owned a x64 device maybe is someone that has more chance to incur in some malware, instead of a normal person that use the computer just to do some research on Google and write some emails. </font>
#plot the pie chart
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 15))
df_infected.groupby('Wdft_IsGamer').size().plot(kind='pie', autopct=label_function, textprops={'fontsize': 15},colors=['skyblue','violet'], ax=ax1)
df_not_infected.groupby('Wdft_IsGamer').size().plot(kind='pie', autopct=label_function, textprops={'fontsize': 15},colors=['skyblue','violet'], ax=ax2)
ax1.set_xlabel('Per Infected \nGamer', size=15)
ax2.set_xlabel('Per NotInfected\nGamer', size=15)
Text(0.5, 0, 'Per NotInfected\nGamer')
**COMMENT**:
As we can see from this chart there is no so much difference in the two groups, just a 5% of difference that emphasize that for the Gamer there is a little chance in more to be infected, but the difference between the two is no so relevant, the division is almost the same </font>
#plot the pie chart
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(180, 180))
w1 = df_infected.AVProductsInstalled.value_counts()
w2 = df_not_infected.AVProductsInstalled.value_counts()
w1.plot(kind='pie', autopct=label_function, textprops={'fontsize': 100}, ax=ax1)
w2.plot(kind='pie', autopct=label_function, textprops={'fontsize': 100}, ax=ax2)
ax1.set_xlabel('Per Infected \nAVProductsInstalled', size=150)
ax2.set_xlabel('Per NotInfected \nAVProductsInstalled', size=150)
Text(0.5, 0, 'Per NotInfected \nAVProductsInstalled')
**COMMENT**:
This is the number of Antivirus products installed. In case of a single antivirus, the rate of detection is high. Installing two Antivirus products decreases the rate of detection. </font>
#plot the pie chart
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(180, 180))
w1 = df_infected.SmartScreen.value_counts()
w2 = df_not_infected.SmartScreen.value_counts()
w1.plot(kind='pie', autopct=label_function, textprops={'fontsize': 100}, ax=ax1)
w2.plot(kind='pie', autopct=label_function, textprops={'fontsize': 100}, ax=ax2)
ax1.set_xlabel('Per Infected \nSmartScreen', size=150)
ax2.set_xlabel('Per NotInfected \nSmartScreen', size=150)
Text(0.5, 0, 'Per NotInfected \nSmartScreen')
**COMMENT**:
SmartScreen was the most important feature and it helps to identify reported phishing and malware websites and also helps you make informed decisions about downloads. We have see from the analysis if it exists and is not set can have a large number of detections!
SmartScreen is a cloud-based anti-phishing and anti-malware component included in several Microsoft products.
The 2 important classes for our analysis are:
All the other classes have almost the same percentage in the two groups. In this case we can see that device with less RequireAdmin are more suitable for an infection, that is because the different browser will downloading files and programs without asking for the authorization of the admin, maybe because the admin had lowered the guard of the SmartScreen,
SmartScreen checks files that you download from the web against a list of reported malicious software sites and programs known to be unsafe.
This is the SmartScreen enabled string value from registry. We can see that if it exists and is not set can have a large number of detections!</font>
**QUESTIONS**:
At least we were able to implement with all the data only the four different type of Kmeans. And in this section can be found the plot. Where were implemented the PCA -> we put on the axes the Component1 and the Component2 while in the other cases -> we used 'InternalPrimaryDiagonalDisplaySizeInInches' and 'TotalPhysicalRAM'
Actually in the one plotted in Kmeans_pca it seems that the algorithm divide the data in two groups, but they are no well separeted, it is all a big block.
For a detailed answer to this question look at Final Result Cluster)
For this dataset instead of the multivariate anomaly detection in oder to individualize outliers we used the boxplot, but in the other one we implemented the IsolationForest to detect and see the outliers present.
We have lots of outliers in:
High values of all these columns can be a sign hat the owner of that kind of device are experts or person that for work need big machine able to implement huge systems.
In this other columns the number of outliers is sligtly less, so here maybe they are just noise generated during the collection of data.
We decide to do our analysis with different dabatabase that were optimazed and cleaned in different ways, we use:
DF_CLEAN = a database cleaned from:
PCA = a database cleaned from the previous things and in addition was performed a Principal Component Analysis
true_labels = df['HasDetections'].values
def elbow_method_on_homoCHwcss (true_labels, dataset): #plots homogeneity score of the dataset
random_state = 42
homogeneity_values = []
wcss = []
ch_values = []
completeness = []
x_range = range(2, 10)
for k in x_range:
t0 = time.time()
kmeans = KMeans(n_clusters=k, random_state=random_state).fit(dataset)
homogeneity_values.append(homogeneity_score(true_labels, kmeans.labels_))
ch_values.append(calinski_harabasz_score(dataset, kmeans.labels_))
wcss.append(kmeans.inertia_)
completeness.append(completeness_score(true_labels, kmeans.labels_ ))
#print("k = %2d; Elapsed time = %.2f s" % (k, time.time()-t0))
fig, axs = plt.subplots(2, 2,figsize=(12,12))
axs[0, 0].plot(x_range, homogeneity_values)
axs[0, 0].set_xlabel('Number of Cluster')
axs[0, 0].set_ylabel('Homogeneity')
axs[0, 0].set_title('Homogeneity')
axs[0, 1].plot(x_range, ch_values)
axs[0, 1].set_xlabel('Number of Cluster')
axs[0, 1].set_ylabel('CH index')
axs[0, 1].set_title('Elbow method on CH index')
axs[1, 0].plot(x_range, wcss, marker = 'x', linestyle = '--')
axs[1, 0].set_xlabel('Number of Cluster')
axs[1, 0].set_ylabel('WCSS')
axs[1, 0].set_title('Elbow method on WCSS')
axs[1, 1].plot(x_range, completeness)
axs[1, 1].set_xlabel('Number of Cluster')
axs[1, 1].set_ylabel('Completeness')
axs[1, 1].set_title('Completeness Score')
plt.show()
# for defining the Kmeans algorithm
def kmeans_clustering (df, k):
kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
kmeans.fit(df)
return kmeans
# for plotting with the kmenas
def plot_kmeans_2features(kmeans, df, feature1, feature2):
df_kmeans=df
df_kmeans['Cluster_kmeans'] = kmeans.labels_
# visualize the data
x_axis = df_kmeans[feature1]
y_axis = df_kmeans[feature2]
plt.figure(figsize=(10,10))
sns.scatterplot(x_axis, y_axis, hue=df_kmeans['Cluster_kmeans'])
plt.title('Clusters KMEANS')
plt.show()
return df_kmeans
# to see the result of the algorithms: GMM+Birch+DBSCAN
def show_result (true_labels, label,database):
print ("The score of Homogeneity is: " , homogeneity_score(true_labels, label))
print ("\nThe score of Calinski is: " , calinski_harabasz_score(database, label))
print ("\nThe score of Completeness is: " , completeness_score(true_labels, label))
print ("\nThe score of Rand is: " , rand_score(true_labels, label))
print ("\nThe score of Adjusted_Rand is: " , adjusted_rand_score(true_labels, label))
print ("\nThe score of Mallows is: " , fowlkes_mallows_score(true_labels, label))
result_cluster = pd.DataFrame(index=["Homogeneity", "CHscore", "Completeness", "Rand", "Rand_adjusted", "Fowlkes-Mallows"])
Kmeans is a centroid-based algorithm.
It separates data points based on multiple centroids in the data. Each data point is assigned to a cluster based on its squared distance from the centroid. It's a little sensitive to the initial parameters you give it, but it's fast and efficient and very easy to use.
One of the problems with k-means is that the data needs to follow a circular format. The way k-means calculates the distance between data points has to do with a circular path, so non-circular data isn't clustered correctly.
Moreover we implement the PCA in order to add a Feature Selection and since we knw that Kmeans is influenced by binary columns we decide to try to run the algorithm deleting the binary comulns before.
So last we obtained 4 different kind of Kmeans, according to all the possible combinations that we could have done.
Ploting different measurement in order to see the best K to use
elbow_method_on_homoCHwcss(true_labels=true_labels, dataset=df)
kmeans = kmeans_clustering(df, 2)
df_kmeans_2 = plot_kmeans_2features(kmeans, df, 'Census_InternalPrimaryDiagonalDisplaySizeInInches', 'Census_TotalPhysicalRAM')
result_cluster['Kmeans_clean'] = [homogeneity_score(true_labels, kmeans.labels_),
calinski_harabasz_score(df, kmeans.labels_),
completeness_score(true_labels, kmeans.labels_ ),
rand_score(true_labels, kmeans.labels_),
adjusted_rand_score(true_labels, kmeans.labels_),
fowlkes_mallows_score(true_labels, kmeans.labels_)]
kmeans = kmeans_clustering(df, 6)
df_km_nobin_6 = plot_kmeans_2features(kmeans, df, 'Census_InternalPrimaryDiagonalDisplaySizeInInches', 'Census_TotalPhysicalRAM' )
result_cluster['Kmeans_clean_k=6'] = [homogeneity_score(true_labels, kmeans.labels_),
calinski_harabasz_score(df, kmeans.labels_),
completeness_score(true_labels, kmeans.labels_),
rand_score(true_labels, kmeans.labels_),
adjusted_rand_score(true_labels, kmeans.labels_),
fowlkes_mallows_score(true_labels, kmeans.labels_)]
binary_columns = [j for j in df.columns if df[j].nunique() == 2]
df_nobinary = df.drop(binary_columns, axis=1)
elbow_method_on_homoCHwcss(true_labels, df_nobinary) #2 and 7
We know that thare are two main gropus in our dataset HasDetections and NotHasDetections, so we implement the algorithm with K=2 even if the best K according to:
kmeans = kmeans_clustering(df_nobinary, 2)
df_km_nobin_2 = plot_kmeans_2features(kmeans, df_nobinary, 'Census_InternalPrimaryDiagonalDisplaySizeInInches', 'Census_TotalPhysicalRAM' )
result_cluster['Kmeans_nobin_k=2'] = [homogeneity_score(true_labels, kmeans.labels_),
calinski_harabasz_score(df, kmeans.labels_),
completeness_score(true_labels, kmeans.labels_ ),
rand_score(true_labels, kmeans.labels_),
adjusted_rand_score(true_labels, kmeans.labels_),
fowlkes_mallows_score(true_labels, kmeans.labels_)]
kmeans = kmeans_clustering(df_nobinary, 3)
df_km_nobin_3 = plot_kmeans_2features(kmeans, df_nobinary, 'Census_InternalPrimaryDiagonalDisplaySizeInInches', 'Census_TotalPhysicalRAM' )
result_cluster['Kmeans_nobin_k=3'] = [homogeneity_score(true_labels, kmeans.labels_),
calinski_harabasz_score(df, kmeans.labels_),
completeness_score(true_labels, kmeans.labels_ ),
rand_score(true_labels, kmeans.labels_),
adjusted_rand_score(true_labels, kmeans.labels_),
fowlkes_mallows_score(true_labels, kmeans.labels_)]
In the previous PCA (the one implemented in the DATA CLEANING) we already implemented it without giving the label so there is no need to redo it, we will use the same X_train calculated before
# here we used 'y_train_pca' beacuse otherwise the true_labels would have had a different length from the 'X_train_pca'
elbow_method_on_homoCHwcss(y_train_pca, X_train_pca)
kmeans_pca = kmeans_clustering(X_train_pca, 2)
# Before all else, we’ll create a new data frame.
# It allows us to add in the values of the separate components to our segmentation data set.
# The components’ scores are stored in the ‘scores P C A’ variable. Let’s label them Component 1, 2 and 3. In addition, we also append the ‘K means P C A’ labels to the new data frame.
df_kmeans_pca = pd.concat([df.reset_index(drop = True), pd.DataFrame(X_train_pca)], axis= 1)
# generiamo i nomi delle colonne
numbers = range(1, 46)
comp_name = []
for i in numbers:
name = ('Component_%d' %i)
comp_name.append(name)
# add the name
df_kmeans_pca.columns.values[-45: ] = [j for j in comp_name]
df_kmeans_pca.dropna(inplace=True)
df_kmeans_pca['Cluster_kmeans_pca'] = kmeans_pca.labels_
# visualize the data
x_axis_pca = df_kmeans_pca['Component_1']
y_axis_pca = df_kmeans_pca['Component_2']
plt.figure(figsize=(10,10))
sns.scatterplot(x_axis_pca, y_axis_pca, hue=df_kmeans_pca['Cluster_kmeans_pca'], palette=['g', 'c'])
plt.title('Clusters K-MEANS by PCA')
plt.show()
result_cluster['Kmeans_pca_k=2'] = [homogeneity_score(y_train_pca, kmeans_pca.labels_),
calinski_harabasz_score(df_kmeans_pca, kmeans_pca.labels_),
completeness_score(y_train_pca, kmeans_pca.labels_ ),
rand_score(y_train_pca, kmeans_pca.labels_),
adjusted_rand_score(y_train_pca, kmeans_pca.labels_),
fowlkes_mallows_score(y_train_pca, kmeans_pca.labels_)]
kmeans_pca_6 = kmeans_clustering(X_train_pca, 6) #best coulb be 6
# Before all else, we’ll create a new data frame.
# It allows us to add in the values of the separate components to our segmentation data set.
# The components’ scores are stored in the ‘scores P C A’ variable. Let’s label them Component 1, 2 and 3. In addition, we also append the ‘K means P C A’ labels to the new data frame.
df_kmeans_pca_6 = pd.concat([df.reset_index(drop = True), pd.DataFrame(X_train_pca)], axis= 1)
# generiamo i nomi delle colonne
numbers = range(1, 46)
comp_name = []
for i in numbers:
name = ('Component_%d' %i)
comp_name.append(name)
# add the name
df_kmeans_pca_6.columns.values[-45: ] = [j for j in comp_name]
df_kmeans_pca_6.dropna(inplace=True)
df_kmeans_pca_6['Cluster_kmeans_pca'] = kmeans_pca_6.labels_
# visualize the data
x_axis_pca_6 = df_kmeans_pca_6['Component_1']
y_axis_pca_6 = df_kmeans_pca_6['Component_2']
plt.figure(figsize=(10,10))
sns.scatterplot(x_axis_pca_6, y_axis_pca_6, hue=df_kmeans_pca_6['Cluster_kmeans_pca'], palette=['b', 'g', 'r', 'c', 'm','y'])
plt.title('Clusters K-MEANS by PCA')
plt.show()
result_cluster['Kmeans_pca_k=6'] = [homogeneity_score(y_train_pca, kmeans_pca_6.labels_),
calinski_harabasz_score(df_kmeans_pca_6, kmeans_pca_6.labels_),
completeness_score(y_train_pca, kmeans_pca_6.labels_ ),
rand_score(y_train_pca, kmeans_pca_6.labels_),
adjusted_rand_score(y_train_pca, kmeans_pca_6.labels_),
fowlkes_mallows_score(y_train_pca, kmeans_pca_6.labels_)]
In this case we cannot use the same result used before because we want to use a datafrme withou binary columns, since the Kmeans can be highly influeced by them. So since we change the starting dataset we need to recompute the PCA
PCA
!pip install dash
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
Requirement already satisfied: dash in c:\users\lr999\anaconda3\lib\site-packages (2.0.0) Requirement already satisfied: Flask>=1.0.4 in c:\users\lr999\anaconda3\lib\site-packages (from dash) (1.1.2) Requirement already satisfied: dash-table==5.0.0 in c:\users\lr999\anaconda3\lib\site-packages (from dash) (5.0.0) Requirement already satisfied: dash-html-components==2.0.0 in c:\users\lr999\anaconda3\lib\site-packages (from dash) (2.0.0) Requirement already satisfied: dash-core-components==2.0.0 in c:\users\lr999\anaconda3\lib\site-packages (from dash) (2.0.0) Requirement already satisfied: flask-compress in c:\users\lr999\anaconda3\lib\site-packages (from dash) (1.10.1) Requirement already satisfied: plotly>=5.0.0 in c:\users\lr999\anaconda3\lib\site-packages (from dash) (5.4.0) Requirement already satisfied: itsdangerous>=0.24 in c:\users\lr999\anaconda3\lib\site-packages (from Flask>=1.0.4->dash) (1.1.0) Requirement already satisfied: Werkzeug>=0.15 in c:\users\lr999\anaconda3\lib\site-packages (from Flask>=1.0.4->dash) (1.0.1) Requirement already satisfied: click>=5.1 in c:\users\lr999\anaconda3\lib\site-packages (from Flask>=1.0.4->dash) (7.1.2) Requirement already satisfied: Jinja2>=2.10.1 in c:\users\lr999\anaconda3\lib\site-packages (from Flask>=1.0.4->dash) (2.11.2) Requirement already satisfied: brotli in c:\users\lr999\anaconda3\lib\site-packages (from flask-compress->dash) (1.0.9) Requirement already satisfied: six in c:\users\lr999\anaconda3\lib\site-packages (from plotly>=5.0.0->dash) (1.15.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\lr999\anaconda3\lib\site-packages (from plotly>=5.0.0->dash) (8.0.1) Requirement already satisfied: MarkupSafe>=0.23 in c:\users\lr999\anaconda3\lib\site-packages (from Jinja2>=2.10.1->Flask>=1.0.4->dash) (1.1.1)
# intialize pca and
pca = PCA()
# to look at which component should we use to go on with our analysis -> look at the variance explained by each variable
pca.fit(df_nobinary)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)
px.area(
x=range(1, exp_var_cumul.shape[0] + 1),
y=exp_var_cumul,
labels={"x": "# Components", "y": "Explained Variance"}
)
pca = PCA(n_components=43)
pca.fit(df_nobinary)
# We need only the calculated resulting components scores for the elements in our data set
scores_pca_nobin = pca.transform(df_nobinary)
elbow_method_on_homoCHwcss(true_labels, scores_pca_nobin)
kmeans_nobin_pca = KMeans(n_clusters=2, init='k-means++', random_state=42)
kmeans_nobin_pca.fit(scores_pca_nobin)
# Before all else, we’ll create a new data frame.
# It allows us to add in the values of the separate components to our segmentation data set.
# The components’ scores are stored in the ‘scores P C A’ variable. Let’s label them Component 1, 2 and 3. In addition, we also append the ‘K means P C A’ labels to the new data frame.
df_kmeans_pca_nobin = df_nobinary
df_kmeans_pca_nobin = pd.concat([df.reset_index(drop = True), pd.DataFrame(scores_pca_nobin)], axis= 1)
# generiamo i nomi delle colonne
numbers = range(1, 41)
comp_name = []
for i in numbers:
name = ('Component_%d' %i)
comp_name.append(name)
# add the name
df_kmeans_pca_nobin.columns.values[-40: ] = [j for j in comp_name]
df_kmeans_pca_nobin['Cluster_kmeans_pca_nobin'] = kmeans_nobin_pca.labels_
# visualize the data
x_axis_pca_nobin = df_kmeans_pca_nobin['Component_2']
y_axis_pca_nobin = df_kmeans_pca_nobin['Component_1']
plt.figure(figsize=(10,10))
sns.scatterplot(x_axis_pca_nobin, y_axis_pca_nobin, hue=df_kmeans_pca_nobin['Cluster_kmeans_pca_nobin'], palette=['g', 'c'])
plt.title('Clusters K-MEANS by PCA and NoBinary')
plt.show()
result_cluster['Kmeans_pca_nobin1_k=2'] = [homogeneity_score(true_labels, kmeans_nobin_pca.labels_),
calinski_harabasz_score(df, kmeans_nobin_pca.labels_),
completeness_score(true_labels, kmeans_nobin_pca.labels_ ),
rand_score(true_labels, kmeans_nobin_pca.labels_),
adjusted_rand_score(true_labels, kmeans_nobin_pca.labels_),
fowlkes_mallows_score(true_labels, kmeans_nobin_pca.labels_)]
kmeans_nobin_pca_7 = KMeans(n_clusters=7, init='k-means++', random_state=42)
kmeans_nobin_pca_7.fit(scores_pca_nobin)
# Before all else, we’ll create a new data frame.
# It allows us to add in the values of the separate components to our segmentation data set.
# The components’ scores are stored in the ‘scores P C A’ variable. Let’s label them Component 1, 2 and 3. In addition, we also append the ‘K means P C A’ labels to the new data frame.
df_kmeans_pca_nobin_7 = df_nobinary
df_kmeans_pca_nobin_7 = pd.concat([df.reset_index(drop = True), pd.DataFrame(scores_pca_nobin)], axis= 1)
# generiamo i nomi delle colonne
numbers = range(1, 41)
comp_name = []
for i in numbers:
name = ('Component_%d' %i)
comp_name.append(name)
# add the name
df_kmeans_pca_nobin_7.columns.values[-40: ] = [j for j in comp_name]
df_kmeans_pca_nobin_7['Cluster_kmeans_pca_nobin'] = kmeans_nobin_pca_7.labels_
# visualize the data
x_axis_pca_nobin = df_kmeans_pca_nobin_7['Component_2']
y_axis_pca_nobin = df_kmeans_pca_nobin_7['Component_1']
plt.figure(figsize=(10,10))
sns.scatterplot(x_axis_pca_nobin, y_axis_pca_nobin, hue=df_kmeans_pca_nobin_7['Cluster_kmeans_pca_nobin'], palette=['b', 'g', 'r', 'c', 'm','y','k'])
plt.title('Clusters K-MEANS by PCA and NoBinary')
plt.show()
result_cluster['Kmeans_pca_nobin1_k=7'] = [homogeneity_score(true_labels, kmeans_nobin_pca_7.labels_),
calinski_harabasz_score(df, kmeans_nobin_pca_7.labels_),
completeness_score(true_labels, kmeans_nobin_pca_7.labels_ ),
rand_score(true_labels, kmeans_nobin_pca_7.labels_ ),
adjusted_rand_score(true_labels, kmeans_nobin_pca_7.labels_ ),
fowlkes_mallows_score(true_labels, kmeans_nobin_pca_7.labels_ )]
fig, axs = plt.subplots(2, 2,figsize=(15,15))
fig.suptitle('Kmeans Clusters')
x_axis = df_kmeans_2['Census_TotalPhysicalRAM']
y_axis = df_kmeans_2['Census_SystemVolumeTotalCapacity']
ax1 = axs[0, 0]
ax1.set_title('Clusters KMEANS')
sns.scatterplot(
x_axis, y_axis,
hue=df_kmeans_2['Cluster_kmeans'],
palette=['g', 'c'],
data=df_kmeans_2,
legend="full",
alpha=0.3,
ax=ax1
)
x_axis_nobin = df_km_nobin_2['Census_TotalPhysicalRAM']
y_axis_nobin = df_km_nobin_2['Census_SystemVolumeTotalCapacity']
ax2 = axs[0, 1]
ax2.set_title('Clusters K-MEANS with no binary')
sns.scatterplot(
x_axis_nobin, y_axis_nobin,
hue=df_km_nobin_2['Cluster_kmeans'],
palette=['g', 'c'],
data=df_km_nobin_2,
legend="full",
ax=ax2
)
x_axis_pca = df_kmeans_pca['Component_2']
y_axis_pca = df_kmeans_pca['Component_1']
ax3 = axs[1, 0]
ax3.set_title('Clusters K-MEANS by PCA')
sns.scatterplot(
x_axis_pca, y_axis_pca,
hue=df_kmeans_pca['Cluster_kmeans_pca'],
palette=['g', 'c'],
data=df_kmeans_pca,
legend="full",
alpha=0.3,
ax=ax3
)
x_axis_pca_nobin= df_kmeans_pca_nobin['Component_2']
y_axis_pca_nobin= df_kmeans_pca_nobin['Component_1']
ax4 = axs[1, 1]
ax4.set_title('Clusters K-MEANS by PCA and NoBinary')
sns.scatterplot(
x_axis_pca_nobin, y_axis_pca_nobin,
hue=df_kmeans_pca_nobin['Cluster_kmeans_pca_nobin'],
palette=['g', 'c'],
data=df_kmeans_pca_nobin,
legend="full",
alpha=0.3,
ax=ax4
)
<AxesSubplot:title={'center':'Clusters K-MEANS by PCA and NoBinary'}, xlabel='Component_2', ylabel='Component_1'>
we tried also to implement other algorithms like:
But unfortounately only the last one works with the dataset reduce to the 10%. The big issue was the size of the dataset and the low performance of our tools and computers.
Since these result of the GMM will no be comparable to the one that we obtained from the Kmeans ( that are trained on the entire cleaned dataset) we decide to avoid the step to look at the best cluster to be choosen and to plot just a graph of the possible rappresentation of the two clusters according to the diffferent models. We also add the result obtained, but we will no insert them in the datarame, because are no reliable enough.</font>
Here you do not need circular data shaped.The Gaussian mixture model uses multiple Gaussian distributions to fit arbitrarily shaped data.
A Gaussian mixture model is a probabilistic model that assumes all the data points are generated from a mixture of a finite number of Gaussian distributions with unknown parameters. One can think of mixture models as generalizing k-means clustering to incorporate information about the covariance structure of the data as well as the centers of the latent Gaussians.
WE NEED TO REDUCE THE SIZE OF THE DATAFRAME TO BE ABLE TO RUN ALL THE ALGORITHMS BELOW:
# REDUCING THE SIZE THROUGH PERCENTAGE
part_1 = df.sample(frac = 0.1, random_state= 42)
X_uns = part_1.loc[:, ~part_1.columns.isin(['HasDetections'])] # data on which we train the algorithm
y_uns = part_1['HasDetections'] # true label to calculate the evaluation metrics
from matplotlib import pyplot
model = GMM(n_components=2, random_state=42)
# fit the model
model.fit(X_uns)
# assign a cluster to each example
yhat = model.predict(X_uns)
# create scatter plot for samples from each cluster
pyplot.figure(figsize= (10,10))
pyplot.scatter(X_uns['Census_InternalPrimaryDiagonalDisplaySizeInInches'],
X_uns['Census_TotalPhysicalRAM'],
c= yhat,
cmap='viridis')
# show the plot
pyplot.title('GMM Clustering Algorithm')
pyplot.show()
show_result (y_uns, yhat, part_1)
The score of Homogeneity is: 0.002609289277364525 The score of Calinski is: 5390.84594746103 The score of Completeness is: 0.002655924490464643 The score of Rand is: 0.5017588924557445 The score of Adjusted_Rand is: 0.0035180589900579966 The score of Mallows is: 0.5077569407906355
The Balance Iterative Reducing and Clustering using Hierarchies (BIRCH) algorithm works better on large data sets than the k-means algorithm.
It breaks the data into little summaries that are clustered instead of the original data points. The summaries hold as much distribution information about the data points as possible.
!!! The main downside of the BIRCH algorithm is that it only works on numeric data values. You can't use this for categorical values unless you do some data transformations.
# define the model
birch_model = Birch(threshold=0.5,branching_factor=3, n_clusters=2)
# train the model
bclust = birch_model.fit(X_uns)
# assign each data point to a cluster
birch_result = bclust.predict(X_uns)
--------------------------------------------------------------------------- MemoryError Traceback (most recent call last) <ipython-input-242-7e4aac83c1ed> in <module> 3 4 # train the model ----> 5 bclust = birch_model.fit(X_uns) 6 7 # assign each data point to a cluster ~\anaconda3\lib\site-packages\sklearn\cluster\_birch.py in fit(self, X, y) 515 # TODO: Remove deprecated flags in 1.2 516 self._deprecated_fit, self._deprecated_partial_fit = True, False --> 517 return self._fit(X, partial=False) 518 519 def _fit(self, X, partial): ~\anaconda3\lib\site-packages\sklearn\cluster\_birch.py in _fit(self, X, partial) 579 self.subcluster_centers_ = centroids 580 --> 581 self._global_clustering(X) 582 return self 583 ~\anaconda3\lib\site-packages\sklearn\cluster\_birch.py in _global_clustering(self, X) 721 # the leaves. It assumes the centroids of the subclusters as 722 # samples and finds the final centroids. --> 723 self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_) 724 725 if compute_labels: ~\anaconda3\lib\site-packages\sklearn\cluster\_agglomerative.py in fit_predict(self, X, y) 1052 Cluster labels. 1053 """ -> 1054 return super().fit_predict(X, y) 1055 1056 ~\anaconda3\lib\site-packages\sklearn\base.py in fit_predict(self, X, y) 729 # non-optimized default implementation; override when a better 730 # method is possible for a given clustering algorithm --> 731 self.fit(X) 732 return self.labels_ 733 ~\anaconda3\lib\site-packages\sklearn\cluster\_agglomerative.py in fit(self, X, y) 916 """ 917 X = self._validate_data(X, ensure_min_samples=2, estimator=self) --> 918 return self._fit(X) 919 920 def _fit(self, X): ~\anaconda3\lib\site-packages\sklearn\cluster\_agglomerative.py in _fit(self, X) 999 return_distance = (distance_threshold is not None) or self.compute_distances 1000 -> 1001 out = memory.cache(tree_builder)( 1002 X, 1003 connectivity=connectivity, ~\anaconda3\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs) 350 351 def __call__(self, *args, **kwargs): --> 352 return self.func(*args, **kwargs) 353 354 def call_and_shelve(self, *args, **kwargs): ~\anaconda3\lib\site-packages\sklearn\cluster\_agglomerative.py in ward_tree(X, connectivity, n_clusters, return_distance) 269 ) 270 X = np.require(X, requirements="W") --> 271 out = hierarchy.ward(X) 272 children_ = out[:, :2].astype(np.intp) 273 ~\anaconda3\lib\site-packages\scipy\cluster\hierarchy.py in ward(y) 824 825 """ --> 826 return linkage(y, method='ward', metric='euclidean') 827 828 ~\anaconda3\lib\site-packages\scipy\cluster\hierarchy.py in linkage(y, method, metric, optimal_ordering) 1064 result = _hierarchy.mst_single_linkage(y, n) 1065 elif method in ['complete', 'average', 'weighted', 'ward']: -> 1066 result = _hierarchy.nn_chain(y, n, method_code) 1067 else: 1068 result = _hierarchy.fast_linkage(y, n, method_code) _hierarchy.pyx in scipy.cluster._hierarchy.nn_chain() ~\anaconda3\lib\site-packages\scipy\cluster\_hierarchy.cp38-win_amd64.pyd in View.MemoryView.array_cwrapper() ~\anaconda3\lib\site-packages\scipy\cluster\_hierarchy.cp38-win_amd64.pyd in View.MemoryView.array.__cinit__() MemoryError: unable to allocate array data.
# get all of the unique clusters
birch_clusters = unique(birch_result)
birch_clusters
plt.figure(figsize=(10,10))
plt.scatter(X_uns['Census_InternalPrimaryDiagonalDisplaySizeInInches'],
X_uns['Census_TotalPhysicalRAM'],
c= birch_result,
cmap='Set1')
plt.title('Clusters BIRCH by PCA')
plt.show()
show_result (y_uns, birch_result, part_1)
This algorithm is better than k-means when it comes to working with oddly shaped data. DBSCAN stands for density-based spatial clustering of applications with noise. It's a density-based clustering algorithm, unlike k-means.
This is a good algorithm for finding outliners in a data set. It finds arbitrarily shaped clusters based on the density of data points in different regions. It separates regions by areas of low-density so that it can detect outliers between the high-density clusters.
model = DBSCAN(eps=0.1, min_samples=50)
# fit model and predict clusters
yhat = model.fit_predict(X_uns)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
pyplot.figure(figsize=(10,10))
pyplot.scatter(X_uns['Census_InternalPrimaryDiagonalDisplaySizeInInches'],
X_uns['Census_TotalPhysicalRAM'],
c= yhat,
cmap='viridis')
# show the plot
pyplot.title('DBSCAN Clustering Algorithm')
pyplot.show()
labels = model.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
show_result (y_uns, yhat, part_1)
result_cluster.T
result_cluster.T.style.apply(highlight_max, props='color:white; background-color:purple;', axis=0)
# axis=0 so among all the algorithm we find the one that reach the higher score for each metrics
| Homogeneity | CHscore | Completeness | Rand | Rand_adjusted | Fowlkes-Mallows | |
|---|---|---|---|---|---|---|
| Kmeans_clean | 0.000000 | 77377.666281 | 0.000000 | 0.499999 | -0.000001 | 0.534298 |
| Kmeans_nobin_k=2 | 0.000720 | 129575.007311 | 0.000724 | 0.500493 | 0.000985 | 0.502568 |
| Kmeans_nobin_k=3 | 0.001386 | 103410.435794 | 0.000900 | 0.500696 | 0.001392 | 0.422461 |
| Kmeans_pca_k=2 | 0.000003 | 24624.246610 | 0.000003 | 0.500000 | 0.000001 | 0.535123 |
| Kmeans_pca_nobin1_k=2 | 0.000017 | 103498.834428 | 0.000020 | 0.500009 | 0.000017 | 0.540383 |
| Kmeans_clean_k=6 | 0.003418 | 49926.598587 | 0.001505 | 0.500933 | 0.001866 | 0.328760 |
| Kmeans_pca_k=6 | 0.005532 | 16745.373999 | 0.002320 | 0.500969 | 0.001938 | 0.321019 |
| Kmeans_pca_nobin1_k=7 | 0.003245 | 62487.530675 | 0.001431 | 0.500893 | 0.001786 | 0.329344 |
Homogenity: the quality or state of being of a similar kind or of having a uniform structure or composition throughout : the quality or state of being homogeneous. It is an important characteristic that determines the extent to which a representative volume of the materials may differ in physical and mechanical properties from the average properties of the materials
Calinski-Harabasz Index is based on the idea that clusters that are themselves very compact and well-spaced from each other are good clusters.
Completeness: Are there gaps in the data?
Data completeness refers to the comprehensiveness or wholeness of the data. There should be no gaps or missing information for data to be truly complete. Sometimes incomplete data is unusable, but often it’s still used even with missing information, which can lead to costly mistakes and false conclusions.
Incomplete data is often a result of unsuccessfully collected data. For example, say a name and email address was supposed to be gathered, but there is no associated email address when the information is imported into your systems. This can happen if your business is gathering information from a survey or gating content in an attempt to get prospect contact information.
# PREPARATION OF THE DATASET TO USE FOR STATISTICS
# re adding the binary columns in order to do some statistics on them
bina = [j for j in df.columns if df[j].nunique() == 2]
df_km_nobin_2=df_km_nobin_2.drop(columns = 'Cluster_kmeans')
df_km_nobin_2 = pd.concat([df_km_nobin_2.reset_index(drop = True), df[bina]], axis= 1)
# divide the dataset in the cluster0 and cluster1
df_cluster1 = df_km_nobin_2[df_km_nobin_2['Cluster_kmeans']==1]
df_cluster0 = df_km_nobin_2[df_km_nobin_2['Cluster_kmeans']==0]
# DEFINITION OF THE DATAFRAME TO SEE THE RESULT OF THE COMPUTATION
rad = pd.DataFrame(index=["MAX", "MIN", "MEAN","MEDIAN"])
def dataframe_rad (df, column):
rad[column] = [max(df[column]),min(df[column]),np.mean(df[column]),statistics.median(df[column])]
dataframe_rad(df_cluster0, 'Census_ProcessorCoreCount')
dataframe_rad(df_cluster0, 'Census_TotalPhysicalRAM')
dataframe_rad(df_cluster0, 'Census_InternalPrimaryDiagonalDisplaySizeInInches')
dataframe_rad(df_cluster0, 'Census_SystemVolumeTotalCapacity')
rad.add_prefix('Cluster0_')
| Cluster0_Census_ProcessorCoreCount | Cluster0_Census_TotalPhysicalRAM | Cluster0_Census_InternalPrimaryDiagonalDisplaySizeInInches | Cluster0_Census_SystemVolumeTotalCapacity | |
|---|---|---|---|---|
| MAX | 59.854021 | 104.284635 | 21.321681 | 80.765077 |
| MIN | -1.443917 | -1.127430 | -2.241127 | -1.128422 |
| MEAN | 0.226612 | 0.300334 | 0.753908 | -0.104719 |
| MEDIAN | 0.004065 | -0.406135 | 0.565994 | -0.452025 |
dataframe_rad(df_cluster1, 'Census_ProcessorCoreCount')
dataframe_rad(df_cluster1, 'Census_TotalPhysicalRAM')
dataframe_rad(df_cluster1, 'Census_InternalPrimaryDiagonalDisplaySizeInInches')
dataframe_rad(df_cluster1, 'Census_SystemVolumeTotalCapacity')
rad.add_prefix('Cluster1_')
| Cluster1_Census_ProcessorCoreCount | Cluster1_Census_TotalPhysicalRAM | Cluster1_Census_InternalPrimaryDiagonalDisplaySizeInInches | Cluster1_Census_SystemVolumeTotalCapacity | |
|---|---|---|---|---|
| MAX | 5.795997 | 11.958916 | 10.552542 | 28.096524 |
| MIN | -1.443917 | -1.075909 | -2.036973 | -1.156684 |
| MEAN | -0.088941 | -0.119868 | -0.301338 | 0.044013 |
| MEDIAN | 0.004065 | -0.406135 | -0.199584 | -0.223474 |
**COMMENT**:
Moreover in ProcessorCoreCount and InternalPrimaryDiagonalDisplaySizeInInches there is a shift in the distribution.
In the Cluster1 these variables have a negative asymmetry while in Cluster0 is the opposite and they have a positive asymmetry, this is due to the huge difference in the max value that move the mean.
</font>
# DEFINITION OF THE DATAFRAME TO SEE THE RESULT OF THE COMPUTATION
percent_K = pd.DataFrame(index=["PercentageCluster0", "PercentageCluster1"])
def dataframe_percent_K (df_cluster0,df_cluster1, column):
percent_K[column] = [len(df_cluster0[df_cluster0[column]==1])/df_cluster0.shape[0]*100, len(df_cluster1[df_cluster1[column]==1])/df_cluster1.shape[0]*100]
for i in bina:
dataframe_percent_K (df_cluster0, df_cluster1, i)
percent_K.T.style.apply(highlight_max, props='color:white; background-color:purple;', axis=1)
# axis=1 so in this way we find the higher percentage for each row
IsSxsPassiveMode IsProtected Firewall Census_HasOpticalDiskDrive Census_IsSecureBootEnabled Census_IsTouchEnabled Census_IsPenCapable Census_IsAlwaysOnAlwaysConnectedCapable Wdft_IsGamer HasDetections Replaced Cluster_kmeans
| PercentageCluster0 | PercentageCluster1 | |
|---|---|---|
| IsSxsPassiveMode | 1.622656 | 1.784283 |
| IsProtected | 97.279453 | 93.519693 |
| Firewall | 96.381796 | 98.476352 |
| Census_HasOpticalDiskDrive | 7.871882 | 7.872210 |
| Census_IsSecureBootEnabled | 24.974921 | 58.218162 |
| Census_IsTouchEnabled | 13.821356 | 12.162838 |
| Census_IsPenCapable | 7.390681 | 2.399679 |
| Census_IsAlwaysOnAlwaysConnectedCapable | 10.957327 | 3.673486 |
| Wdft_IsGamer | 33.921913 | 25.200119 |
| HasDetections | 50.709612 | 49.756429 |
| Replaced | 99.887693 | 98.722542 |
| Cluster_kmeans | 0.000000 | 100.000000 |
**COMMENT**:
According to these percentage some of this variables (binary columns all set on ==1) are more present in one cluster that in the other. For instance, the most relevant differences are in these columns (descending order):
These could be seen as the more relevant features for the analysis of the cluster, because we can see a slight difference in th categorization of this variable in the two clusters created. </font>
# DEFINING THE DATASET THAT WE ARE GOING TO USE
df_infected = df[df['HasDetections'] == 1]
# SAVING THE TRUE LABELS
true_labels_inf = df_infected['HasDetections'].values
# AFTER THAT ELIMINATE THE BINARY COLUMN
bin_col_inf = [j for j in df_infected if df_infected[j].nunique()==2]
df_nobin_inf = df_infected.drop(bin_col_inf, axis=1)
df_nobin_inf = df_nobin_inf.drop('HasDetections', axis = 1)
X = df_nobin_inf.loc[:, ~df_nobin_inf.columns.isin(['HasDetections'])]
y = df_infected['HasDetections']
X_train, X_test, y_train_pca, y_test_pca = train_test_split(X, y, test_size=0.3, shuffle= True, stratify=y, random_state=42)
# intialize pca and
pca = PCA()
# to look at which component should we use to go on with our analysis -> look at the variance explained by each variable
pca.fit(X_train)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)
px.area(
x=range(1, exp_var_cumul.shape[0] + 1),
y=exp_var_cumul,
labels={"x": "# Components", "y": "Explained Variance"}
)
# We need only the calculated resulting components scores for the elements in our data set
pca = PCA(n_components=40)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
Now that we have our dataset ready, we can perform clustering
elbow_method_on_homoCHwcss(y_train_pca, X_train_pca)
Best k seems to be 4
kmeans_inf = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans_inf.fit(X_train_pca)
# Before all else, we’ll create a new data frame.
# It allows us to add in the values of the separate components to our segmentation data set.
# The components’ scores are stored in the ‘scores P C A’ variable. Let’s label them Component 1, 2 and 3. In addition, we also append the ‘K means P C A’ labels to the new data frame.
df_kmeans_inf = df_nobin_inf
df_kmeans_inf = pd.concat([df_nobin_inf.reset_index(drop = True), pd.DataFrame(X_train_pca)], axis= 1)
df_kmeans_inf = df_kmeans_inf.dropna()
# generiamo i nomi delle colonne
numbers = range(1, 41)
comp_name = []
for i in numbers:
name = ('Component_%d' %i)
comp_name.append(name)
# add the name
df_kmeans_inf.columns.values[-40: ] = [j for j in comp_name]
df_kmeans_inf['Cluster_kmeans_pca_nobin'] = kmeans_inf.labels_
# visualize the data
x_axis_pca_nobin = df_kmeans_inf['Component_2']
y_axis_pca_nobin = df_kmeans_inf['Component_1']
plt.figure(figsize=(10,10))
sns.scatterplot(x_axis_pca_nobin, y_axis_pca_nobin, hue=df_kmeans_inf['Cluster_kmeans_pca_nobin'], palette='Set1')
plt.title('Clusters K-MEANS by PCA and NoBinary')
plt.show()
df_kmeans_inf = pd.concat([df_kmeans_inf.reset_index(drop = True), df[bin_col_inf]], axis= 1)
df_kmeans_inf.dropna(inplace=True)
df_kmeans_inf
| ProductName | EngineVersion | AppVersion | AvSigVersion | RtpStateBitfield | DefaultBrowsersIdentifier | AVProductStatesIdentifier | AVProductsInstalled | AVProductsEnabled | CountryIdentifier | ... | IsSxsPassiveMode | IsProtected | Firewall | Census_HasOpticalDiskDrive | Census_IsSecureBootEnabled | Census_IsTouchEnabled | Census_IsPenCapable | Census_IsAlwaysOnAlwaysConnectedCapable | Wdft_IsGamer | Replaced | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.10287 | -0.801525 | -0.845984 | 2.687518 | -0.13654 | 0.213221 | -0.167505 | 1.291389 | -0.151878 | -1.280485 | ... | 0 | 1.0 | 0.0 | 0 | 1 | 0 | 0 | 0.0 | 0.0 | 1 |
| 1 | -0.10287 | -0.256762 | -0.578388 | 2.235865 | -0.13654 | -4.695658 | -0.165533 | 1.291389 | -0.151878 | -1.308435 | ... | 0 | 1.0 | 1.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 1 |
| 2 | -0.10287 | -0.801525 | -0.176993 | -0.541013 | -0.13654 | -4.674335 | -0.169478 | -0.622760 | -0.151878 | -1.140738 | ... | 0 | 1.0 | 1.0 | 0 | 1 | 0 | 0 | 0.0 | 0.0 | 1 |
| 3 | -0.10287 | 0.288000 | -0.310791 | -0.094114 | -0.13654 | 0.213221 | -0.169478 | -0.622760 | -0.151878 | -1.056889 | ... | 0 | 1.0 | 1.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 1 |
| 4 | -0.10287 | -0.801525 | -0.310791 | -0.604799 | -0.13654 | 0.213221 | -0.169478 | -0.622760 | -0.151878 | -1.308435 | ... | 0 | 1.0 | 1.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 308236 | -0.10287 | -0.801525 | 1.963781 | -0.559634 | -0.13654 | 0.213221 | -0.169478 | -0.622760 | -0.151878 | -0.218403 | ... | 0 | 1.0 | 1.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 1 |
| 308237 | -0.10287 | 0.288000 | -0.310791 | 0.089321 | -0.13654 | 0.213221 | 11.692612 | 3.205538 | 4.808877 | -0.973040 | ... | 0 | 1.0 | 1.0 | 0 | 0 | 0 | 0 | 0.0 | 1.0 | 1 |
| 308238 | -0.10287 | 0.288000 | -0.310791 | -0.376992 | -0.13654 | 0.213221 | 0.736010 | 1.291389 | -0.151878 | -0.022756 | ... | 0 | 0.0 | 1.0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 1 |
| 308239 | -0.10287 | -0.801525 | -0.310791 | 0.329015 | -0.13654 | 0.213221 | 0.000178 | 1.291389 | -0.151878 | -1.140738 | ... | 0 | 1.0 | 1.0 | 0 | 1 | 0 | 0 | 0.0 | 0.0 | 1 |
| 308240 | -0.10287 | 0.288000 | -0.310791 | -0.652342 | -0.13654 | 0.213221 | -0.169478 | -0.622760 | -0.151878 | -0.022756 | ... | 0 | 1.0 | 1.0 | 0 | 1 | 0 | 0 | 0.0 | 0.0 | 1 |
308241 rows × 101 columns
df_kmeans_inf['Cluster_kmeans_pca_nobin'].value_counts()
1.0 187406 0.0 88236 2.0 29374 3.0 3225 Name: Cluster_kmeans_pca_nobin, dtype: int64
Cluster_0 = df_kmeans_inf[df_kmeans_inf['Cluster_kmeans_pca_nobin']==0]
Cluster_1 = df_kmeans_inf[df_kmeans_inf['Cluster_kmeans_pca_nobin']==1]
Cluster_2 = df_kmeans_inf[df_kmeans_inf['Cluster_kmeans_pca_nobin']==2]
Cluster_3 = df_kmeans_inf[df_kmeans_inf['Cluster_kmeans_pca_nobin']==3]
# in order to see a general distribution in a random cluster in order to individualize important features to analyse deeper
for col in Cluster_0.columns:
print(Cluster_0[col].value_counts())
-0.102870 87783
9.679376 452
29.243867 1
Name: ProductName, dtype: int64
-0.801525 40384
0.288000 37351
-0.529144 1828
0.560382 1377
1.922289 1224
1.105145 1170
1.649908 881
-0.256762 812
2.467052 753
0.832763 570
2.194670 358
3.011815 268
1.377526 252
0.015619 166
6.552773 120
5.190866 116
2.739433 68
4.101340 67
6.825155 67
3.556578 63
3.284196 46
4.373722 41
5.463248 36
6.280392 32
4.646103 30
7.097536 28
4.918485 26
5.735629 25
8.187062 16
7.642299 12
7.369918 11
6.008011 9
8.459443 8
7.914681 7
9.548969 4
10.093732 3
9.004206 2
13.362309 1
10.910876 1
12.817546 1
11.183258 1
12.272783 1
Name: EngineVersion, dtype: int64
-0.310791 55093
-0.845984 8410
-0.578388 3250
-0.444589 2732
0.090604 2008
...
6.914319 1
6.646723 1
9.456488 1
5.977731 1
9.857883 1
Name: AppVersion, Length: 75, dtype: int64
3.170471 1100
2.603130 937
3.061916 900
2.687518 883
2.863029 870
...
-0.766444 7
-0.734353 7
-0.745050 7
-0.768821 7
-0.767236 6
Name: AvSigVersion, Length: 801, dtype: int64
-0.136540 86444
3.479425 1381
14.327322 322
7.095391 56
10.711357 19
17.943288 14
Name: RtpStateBitfield, dtype: int64
0.213221 85039
-4.674335 403
-4.682339 244
-4.684310 187
-4.686476 148
...
-4.698439 2
-4.698307 2
-4.698399 2
-4.698284 1
-4.698451 1
Name: DefaultBrowsersIdentifier, Length: 99, dtype: int64
-0.169478 64664
-0.163560 3405
-0.165533 3170
-0.161587 2039
-0.153696 1130
...
10.325502 1
4.762374 1
7.080344 1
3.221663 1
1.020085 1
Name: AVProductStatesIdentifier, Length: 1453, dtype: int64
-0.622760 67559
1.291389 19375
3.205538 1249
5.119686 51
7.033835 2
Name: AVProductsInstalled, dtype: int64
-0.151878 86846
4.808877 1185
9.769632 167
14.730387 33
19.691143 5
Name: AVProductsEnabled, dtype: int64
0.508285 3967
-0.973040 3427
-0.469949 3373
-1.308435 2851
-0.581747 2684
...
4.533018 2
4.868413 1
4.840463 1
4.365321 1
4.784564 1
Name: CountryIdentifier, Length: 220, dtype: int64
4.384629 4113
-0.367414 1959
-0.367301 1508
-0.367189 1248
-0.367076 1049
...
-0.344678 17
-0.339500 15
-0.345916 15
-0.332635 14
-0.339725 10
Name: CityIdentifier, Length: 636, dtype: int64
-0.259795 68452
0.424186 17613
3.160112 636
3.844093 546
1.108168 190
5.212056 189
6.580018 142
8.631962 127
7.264000 47
9.315944 42
2.476130 39
9.999925 30
5.896037 26
7.947981 20
10.683906 17
21.627607 15
20.259645 14
15.471776 13
4.528074 12
11.367888 11
1.792149 9
16.839738 8
22.995570 6
12.051869 5
17.523719 5
20.943626 4
18.207701 3
26.415477 3
12.735850 3
14.103813 2
23.679551 2
14.787794 1
13.419832 1
22.311589 1
18.891682 1
29.151402 1
Name: OrganizationIdentifier, dtype: int64
-0.855997 15189
0.782691 4160
-0.364390 4113
-0.659354 3535
-0.823223 3371
...
4.486125 1
6.092038 1
7.566857 1
7.206346 1
6.911382 1
Name: GeoNameIdentifier, Length: 243, dtype: int64
-0.801174 21212
-0.231876 4352
0.906720 4154
-0.841839 3895
-0.597854 3480
...
6.559035 1
8.470250 1
8.022944 1
4.891805 1
5.786416 1
Name: LocaleEnglishNameIdentifier, Length: 181, dtype: int64
0.041439 86194
-5.021289 1506
5.104168 447
10.166897 89
Name: Platform, dtype: int64
0.317818 82142
-3.143507 6094
Name: Processor, dtype: int64
0.757583 54454
-1.273777 33690
2.788943 76
6.851662 8
8.883022 3
4.820303 3
12.945742 1
10.914382 1
Name: OsSuite, dtype: int64
-0.818664 41654
0.394766 24803
1.001480 7461
-0.211949 6068
2.821625 3491
1.608195 2637
-1.425379 1506
2.214910 447
3.428340 169
Name: OsPlatformSubRelease, dtype: int64
-0.382300 39536
-0.319430 13114
-0.130822 9418
-0.162257 7002
-0.287996 2119
...
8.765190 1
6.093243 1
10.305489 1
12.694525 1
12.348743 1
Name: OsBuildLab, Length: 308, dtype: int64
0.501348 54235
-1.080541 32498
2.083236 440
3.665124 433
6.828901 371
5.247013 197
8.410790 47
9.992678 15
Name: SkuEdition, dtype: int64
-0.490608 41917
-0.275585 17678
0.154462 4704
0.584508 4629
0.046950 3219
...
8.325345 1
13.485902 1
12.410786 1
10.260554 1
8.540368 1
Name: IeVerIdentifier, Length: 121, dtype: int64
-0.402406 67572
1.944780 16754
0.771187 1804
4.291967 1531
3.118374 330
5.465560 225
6.639154 9
8.986341 7
7.812747 4
Name: SmartScreen, dtype: int64
-0.540447 57476
0.210813 19876
1.713334 4066
0.962074 2818
2.464595 2273
3.215855 1137
4.718376 408
3.967116 109
5.469637 54
6.972158 18
6.220897 1
Name: Census_MDC2FormFactor, dtype: int64
1.406556 13636
0.631297 10651
0.442896 9692
0.326259 8888
0.385986 8794
...
-1.645912 4
-1.647342 3
-1.646286 3
-1.645692 3
-1.647122 1
Name: Census_OEMNameIdentifier, Length: 218, dtype: int64
3.907854 4069
-0.394924 2747
2.414576 2719
1.088576 1473
-0.394818 1455
...
-0.373244 20
-0.377283 19
-0.375370 18
-0.366230 18
-0.372713 14
Name: Census_OEMModelIdentifier, Length: 633, dtype: int64
0.004065 55141
-0.961257 20931
1.934709 9586
3.865353 1030
0.969387 772
-1.443917 402
5.795997 185
-0.478596 112
7.726640 22
9.657284 20
13.518571 17
11.587928 5
17.379859 5
0.486726 2
21.241146 1
32.825009 1
1.452048 1
4.830675 1
2.900031 1
15.449215 1
Name: Census_ProcessorCoreCount, dtype: int64
-0.363463 78044
2.745538 10192
Name: Census_ProcessorManufacturerIdentifier, dtype: int64
3.165897 3104
2.410670 2563
1.483278 2144
1.279952 1813
1.220664 1811
...
-0.826051 6
-0.811744 6
-0.817564 5
-0.827142 2
-0.821080 1
Name: Census_ProcessorModelIdentifier, Length: 614, dtype: int64
-0.001065 28314
-0.001010 23190
-0.001106 4965
-0.001092 4943
-0.001085 4026
...
-0.001118 1
-0.000999 1
-0.001113 1
-0.000459 1
-0.001094 1
Name: Census_PrimaryDiskTotalCapacity, Length: 387, dtype: int64
-0.703602 58522
1.293492 24790
2.292039 2676
0.294945 2248
Name: Census_PrimaryDiskTypeName, dtype: int64
1.686724 523
-1.069136 479
0.304567 476
0.304368 432
1.767275 416
...
-0.426873 1
-0.838017 1
1.721421 1
-0.515547 1
0.275249 1
Name: Census_SystemVolumeTotalCapacity, Length: 41979, dtype: int64
-0.406135 40581
0.418202 23762
-0.818304 8902
2.066875 5927
0.006033 4229
...
-0.605779 1
-0.586660 1
-0.586861 1
0.406529 1
-0.836819 1
Name: Census_TotalPhysicalRAM, Length: 124, dtype: int64
-0.496940 52529
-0.134199 19139
0.591282 6854
0.954023 3414
2.767726 1964
4.218689 883
1.316763 827
3.493208 499
3.130467 391
2.404985 357
2.042245 324
0.228541 283
1.679504 263
4.581430 126
6.032392 97
5.669652 87
3.855948 83
5.306911 40
6.395133 38
4.944170 21
7.483355 6
8.208837 4
10.022540 3
7.846096 2
11.473503 1
9.297059 1
Name: Census_ChassisTypeName, dtype: int64
-0.199584 31267
-0.471790 9904
-0.454777 5521
0.821187 2969
-0.863086 2848
...
3.543244 1
-1.050227 1
5.601800 1
3.185974 1
6.979842 1
Name: Census_InternalPrimaryDiagonalDisplaySizeInInches, Length: 335, dtype: int64
-0.603533 49756
0.850786 22867
0.011756 6361
0.589754 1785
-0.454372 1752
...
0.776205 1
-1.092967 1
0.543141 1
0.398642 1
-1.298063 1
Name: Census_InternalPrimaryDisplayResolutionVertical, Length: 91, dtype: int64
-0.518832 61964
0.676699 21030
1.872230 3686
5.458825 1071
3.067762 353
4.263293 55
6.654356 54
7.849888 21
9.045419 2
Name: Census_PowerPlatformRoleName, dtype: int64
-0.231953 82523
4.047851 2490
2.977900 1954
1.907949 685
0.837998 311
6.187754 83
7.257705 53
12.607460 42
5.117802 38
8.327656 22
10.467558 18
15.817313 5
17.957216 5
9.397607 2
26.516824 2
32.936531 1
11.537509 1
21.167069 1
Name: Census_InternalBatteryType, dtype: int64
-0.596106 51199
1.677555 23914
-0.596106 412
-0.596106 252
-0.596106 233
...
-0.596104 1
-0.596099 1
-0.596074 1
-0.596077 1
-0.596090 1
Name: Census_InternalBatteryNumberOfCharges, Length: 1431, dtype: int64
-0.711702 15265
-0.846049 10432
-0.778876 5531
0.094378 4991
0.228724 3641
...
6.878884 1
5.468244 1
4.695751 1
-0.006382 1
5.535417 1
Name: Census_OSBuildRevision, Length: 208, dtype: int64
-0.786470 32684
-0.015660 31404
0.755150 20356
3.067580 1659
1.525960 635
2.296770 438
6.150819 385
3.838390 280
4.609200 190
5.380009 73
6.921629 54
7.692439 42
8.463249 11
11.546488 7
10.775679 6
10.004869 4
12.317298 3
9.234059 2
13.858918 2
13.088108 1
Name: Census_OSEdition, dtype: int64
-0.842981 32684
0.024681 32044
0.892343 20350
2.627667 1658
1.760005 438
6.098314 387
3.495328 283
4.362990 190
5.230652 73
6.965976 54
7.833638 42
8.701300 11
12.171948 7
11.304286 6
10.436624 4
13.907272 2
9.568962 2
13.039610 1
Name: Census_OSSkuName, dtype: int64
-0.616835 28049
-1.171399 16523
-0.062271 14540
1.046858 12037
0.492294 7924
1.601422 6073
2.710551 1803
2.155987 795
3.265115 492
Name: Census_OSInstallTypeName, dtype: int64
-0.797672 32421
-0.676116 10465
-0.189893 4984
0.417885 4815
1.998110 4087
-0.311449 4014
-0.068337 3924
0.660997 3274
0.174774 2203
1.511887 2160
1.268775 2046
1.025664 1864
0.782553 1592
-0.554560 1316
1.147220 1283
-0.433005 1105
1.390331 895
1.876554 768
0.296330 690
2.484333 649
2.849000 479
2.362777 435
1.633443 415
2.119665 300
0.904108 297
2.605888 237
2.241221 230
3.578334 215
0.053218 186
1.754998 184
0.539441 160
2.970555 119
3.335223 112
3.092111 110
3.213667 73
3.699890 45
3.456778 43
2.727444 22
3.821445 19
Name: Census_OSInstallLanguageIdentifier, dtype: int64
-0.053624 40499
-1.218154 23976
1.110907 19720
2.275438 3696
4.604500 256
3.439969 89
Name: Census_OSWUAutoUpdateOptionsName, dtype: int64
0.178383 78392
-2.731028 7627
3.087794 2130
5.997205 87
Name: Census_GenuineStateName, dtype: int64
-0.784990 45763
0.530691 34080
1.846371 4983
3.162052 3317
4.477732 78
5.793413 15
Name: Census_ActivationChannel, dtype: int64
-0.223353 82821
1.903803 2844
4.030959 2268
10.412427 106
6.158115 86
12.539583 71
8.285271 40
Name: Census_FlightRing, dtype: int64
-0.017610 28624
-0.616342 12400
-0.516553 11961
-0.217187 9670
-0.117398 8192
...
3.175631 1
0.581123 1
12.655563 1
20.239509 1
23.033595 1
Name: Census_FirmwareManufacturerIdentifier, Length: 152, dtype: int64
5.441470 1608
-0.463791 648
-0.463556 646
-0.463322 640
0.951578 604
...
-0.407064 18
-0.424410 17
-0.418550 15
-0.432849 13
-0.420659 8
Name: Census_FirmwareVersionIdentifier, Length: 640, dtype: int64
-0.425522 21087
-1.054055 13305
-0.739789 13244
0.203011 12735
0.517277 9506
2.088609 5987
-1.368322 2788
1.774343 2007
1.145810 1951
0.831543 1505
2.402876 1501
1.460076 1226
-0.111256 721
2.717142 633
3.031408 40
Name: Wdft_RegionIdentifier, dtype: int64
1.023674 2
3.721900 2
1.041324 2
1.926380 2
1.761508 2
..
2.223835 1
2.967575 1
2.783548 1
2.798795 1
2.965416 1
Name: Component_1, Length: 88228, dtype: int64
-0.634154 2
-0.656682 2
1.094687 2
1.514445 2
2.083571 2
..
-1.075338 1
-0.245787 1
-0.053225 1
-0.306376 1
-0.515112 1
Name: Component_2, Length: 88228, dtype: int64
1.507995 2
-0.483690 2
-0.852611 2
0.338965 2
-0.420620 2
..
-1.079661 1
-0.642727 1
-0.656445 1
1.054774 1
0.177255 1
Name: Component_3, Length: 88228, dtype: int64
1.593897 2
-0.170845 2
0.192939 2
1.179458 2
0.082076 2
..
0.003226 1
1.454010 1
-0.054459 1
-0.123592 1
1.267275 1
Name: Component_4, Length: 88228, dtype: int64
2.463379 2
0.981161 2
-0.992723 2
-0.048850 2
-1.671847 2
..
0.727875 1
1.443378 1
-0.310841 1
-0.796242 1
-2.138707 1
Name: Component_5, Length: 88228, dtype: int64
-1.928939 2
1.340424 2
0.182006 2
0.202422 2
0.259071 2
..
-0.481109 1
0.583616 1
-2.408314 1
-0.544908 1
1.850966 1
Name: Component_6, Length: 88228, dtype: int64
-0.540500 2
-0.704865 2
-0.789103 2
-1.053165 2
-1.684339 2
..
-0.647074 1
0.578071 1
-1.555609 1
-1.725635 1
-2.106888 1
Name: Component_7, Length: 88228, dtype: int64
0.312250 2
0.278282 2
-0.428721 2
0.757526 2
-0.334140 2
..
-0.070287 1
-1.178097 1
-0.727761 1
-0.954939 1
0.378570 1
Name: Component_8, Length: 88228, dtype: int64
1.507408 2
0.763327 2
-2.224624 2
-0.706499 2
-3.214317 2
..
-0.249089 1
1.093403 1
1.805837 1
1.498379 1
-0.333695 1
Name: Component_9, Length: 88228, dtype: int64
-0.681219 2
7.875254 2
-0.798556 2
-1.638804 2
-0.829220 2
..
-0.388212 1
-1.496545 1
0.981635 1
0.282653 1
0.420638 1
Name: Component_10, Length: 88228, dtype: int64
1.813260 2
-0.153411 2
1.451540 2
-1.428599 2
0.850280 2
..
-0.976610 1
0.846653 1
-0.296837 1
-0.998819 1
-1.455861 1
Name: Component_11, Length: 88228, dtype: int64
1.866849 2
1.219383 2
0.811267 2
0.582039 2
-0.632077 2
..
0.574355 1
0.425829 1
-1.492462 1
0.982420 1
-0.438203 1
Name: Component_12, Length: 88228, dtype: int64
-2.728888 2
-0.467645 2
-0.061085 2
-0.342763 2
-1.725425 2
..
0.879979 1
1.674498 1
-1.215622 1
0.170020 1
0.133977 1
Name: Component_13, Length: 88228, dtype: int64
-0.181628 2
-2.732717 2
-0.912149 2
-2.232425 2
-0.536817 2
..
0.999783 1
-0.572671 1
-0.096983 1
1.090182 1
0.622780 1
Name: Component_14, Length: 88228, dtype: int64
0.513252 2
-1.809409 2
3.270563 2
-1.183006 2
1.388042 2
..
-1.236560 1
0.451498 1
-0.309596 1
-2.074253 1
-0.769169 1
Name: Component_15, Length: 88228, dtype: int64
0.417471 2
-1.491294 2
-1.085522 2
-0.965657 2
0.140237 2
..
-0.999247 1
0.966091 1
0.208426 1
0.380476 1
-1.305854 1
Name: Component_16, Length: 88228, dtype: int64
0.548771 2
-1.859444 2
-1.123402 2
-1.756709 2
0.192216 2
..
0.363440 1
0.097209 1
0.332659 1
-0.455596 1
0.556952 1
Name: Component_17, Length: 88228, dtype: int64
2.495307 2
1.416901 2
2.006518 2
2.224294 2
0.100957 2
..
0.002709 1
-0.079830 1
-0.013503 1
-1.778130 1
-0.125489 1
Name: Component_18, Length: 88228, dtype: int64
-0.441233 2
-0.704520 2
-0.201603 2
-0.803483 2
-0.240264 2
..
0.324083 1
0.957605 1
-0.694116 1
0.048245 1
0.160594 1
Name: Component_19, Length: 88228, dtype: int64
1.387024 2
0.326401 2
-0.123219 2
-0.138772 2
0.078044 2
..
1.276668 1
-0.427783 1
0.993429 1
-0.264089 1
2.620428 1
Name: Component_20, Length: 88228, dtype: int64
-1.362905 2
-0.342106 2
0.160376 2
-0.910677 2
1.671180 2
..
1.060211 1
-0.208029 1
-0.503366 1
-0.376100 1
-0.558420 1
Name: Component_21, Length: 88228, dtype: int64
-1.444979 2
0.257935 2
0.254506 2
-0.414986 2
-0.936108 2
..
-0.936793 1
-0.119678 1
0.188054 1
-0.346777 1
1.050719 1
Name: Component_22, Length: 88228, dtype: int64
0.631213 2
0.011853 2
0.153702 2
-0.172800 2
-0.413160 2
..
0.958167 1
0.288719 1
0.444784 1
-1.786381 1
0.033874 1
Name: Component_23, Length: 88228, dtype: int64
1.588632 2
-0.036864 2
0.834028 2
0.527106 2
0.182158 2
..
-0.522829 1
-0.710286 1
0.918165 1
0.083735 1
-0.583670 1
Name: Component_24, Length: 88228, dtype: int64
-1.222227 2
-0.638498 2
-1.281643 2
-0.442778 2
-0.450582 2
..
0.282041 1
0.736824 1
-0.626830 1
0.536052 1
-0.401445 1
Name: Component_25, Length: 88228, dtype: int64
-0.992024 2
-1.520290 2
0.337208 2
-0.886851 2
0.263418 2
..
-2.188151 1
-0.713702 1
-0.668634 1
-0.666991 1
0.175729 1
Name: Component_26, Length: 88228, dtype: int64
0.159141 2
1.046473 2
-0.155602 2
-0.031456 2
0.067736 2
..
0.693524 1
-2.013009 1
1.202748 1
0.206507 1
-0.743050 1
Name: Component_27, Length: 88228, dtype: int64
-0.568554 2
0.005098 2
0.366802 2
-1.920148 2
-0.058042 2
..
-0.006434 1
-0.881699 1
0.625926 1
0.736143 1
0.250350 1
Name: Component_28, Length: 88228, dtype: int64
0.259648 2
-0.162085 2
-1.517835 2
0.116228 2
2.186615 2
..
0.383161 1
-0.540255 1
-0.089197 1
-1.154694 1
-1.221248 1
Name: Component_29, Length: 88228, dtype: int64
-0.522067 2
2.219051 2
0.863242 2
-0.613439 2
0.053960 2
..
-0.153401 1
0.794578 1
-1.637274 1
0.328876 1
-0.182888 1
Name: Component_30, Length: 88228, dtype: int64
0.630751 2
1.055827 2
-0.632611 2
0.560733 2
-0.236268 2
..
-0.721094 1
0.949059 1
0.084804 1
0.364176 1
1.290330 1
Name: Component_31, Length: 88228, dtype: int64
-0.101458 2
-0.435464 2
-0.226875 2
-2.900894 2
-0.719549 2
..
0.533678 1
-1.429064 1
-0.275952 1
1.849570 1
-0.055505 1
Name: Component_32, Length: 88228, dtype: int64
-0.260094 2
-0.115224 2
-0.818232 2
0.686026 2
-0.591274 2
..
-2.151705 1
-0.308193 1
-1.033489 1
-0.495761 1
0.240161 1
Name: Component_33, Length: 88228, dtype: int64
-0.425224 2
-0.846063 2
0.983505 2
2.431495 2
-0.426987 2
..
0.126368 1
0.924967 1
0.058757 1
-1.059585 1
0.887420 1
Name: Component_34, Length: 88228, dtype: int64
0.097072 2
0.660471 2
-1.805405 2
0.267905 2
-0.873717 2
..
0.497716 1
-0.637228 1
0.443969 1
0.440962 1
0.605531 1
Name: Component_35, Length: 88228, dtype: int64
-0.083036 2
-0.110234 2
-0.047112 2
0.657380 2
-0.322396 2
..
-0.883307 1
-1.090306 1
0.675066 1
-0.698604 1
-1.052299 1
Name: Component_36, Length: 88228, dtype: int64
0.114756 2
0.857613 2
-0.093173 2
1.286203 2
0.857283 2
..
1.464185 1
0.652641 1
-0.455825 1
-0.141474 1
0.066531 1
Name: Component_37, Length: 88228, dtype: int64
-0.333955 2
-0.319614 2
0.725507 2
-2.253154 2
0.394593 2
..
0.271941 1
-0.500875 1
0.571463 1
0.316687 1
0.462327 1
Name: Component_38, Length: 88228, dtype: int64
0.423926 2
0.092285 2
0.266538 2
0.424310 2
1.542224 2
..
-0.263644 1
0.166591 1
0.418522 1
-0.182475 1
0.632933 1
Name: Component_39, Length: 88228, dtype: int64
-0.004297 2
-0.157681 2
0.194649 2
-0.025286 2
0.085506 2
..
0.393427 1
0.151024 1
-0.252227 1
-0.610581 1
-0.212815 1
Name: Component_40, Length: 88228, dtype: int64
0 88236
Name: Cluster_kmeans_pca_nobin, dtype: int64
clusters = []
clusters.append(Cluster_0)
clusters.append(Cluster_1)
clusters.append(Cluster_2)
clusters.append(Cluster_3)
# DEFINITION OF THE DATAFRAME TO SEE THE RESULT OF THE COMPUTATION
rad = pd.DataFrame(index=["MAX", "MIN", "MEAN","MEDIAN"])
def dataframe_rad (df, column):
rad[column] = [max(df[column]),min(df[column]),np.mean(df[column]),statistics.median(df[column])]
i = 0
for cl in clusters:
dataframe_rad(cl, 'Census_ProcessorCoreCount')
dataframe_rad(cl, 'Census_TotalPhysicalRAM')
dataframe_rad(cl, 'Census_InternalPrimaryDiagonalDisplaySizeInInches')
dataframe_rad(cl, 'Census_SystemVolumeTotalCapacity')
rad.add_prefix(cl)
display(rad.T)
| MAX | MIN | MEAN | MEDIAN | |
|---|---|---|---|---|
| Census_ProcessorCoreCount | 25.102434 | -1.443917 | 0.063710 | 0.004065 |
| Census_TotalPhysicalRAM | 51.527082 | -1.029218 | 0.069653 | -0.406135 |
| Census_InternalPrimaryDiagonalDisplaySizeInInches | 21.321681 | -1.985934 | 0.030879 | -0.199584 |
| Census_SystemVolumeTotalCapacity | 80.765077 | -1.156684 | 0.021847 | -0.382438 |
| MAX | MIN | MEAN | MEDIAN | |
|---|---|---|---|---|
| Census_ProcessorCoreCount | 21.241146 | -1.443917 | 0.057297 | 0.004065 |
| Census_TotalPhysicalRAM | 51.527082 | -1.127430 | 0.063281 | -0.406135 |
| Census_InternalPrimaryDiagonalDisplaySizeInInches | 12.321879 | -2.122037 | 0.033488 | -0.199584 |
| Census_SystemVolumeTotalCapacity | 33.946575 | -1.126011 | 0.013937 | -0.391531 |
| MAX | MIN | MEAN | MEDIAN | |
|---|---|---|---|---|
| Census_ProcessorCoreCount | 40.547584 | -1.443917 | 0.057410 | 0.004065 |
| Census_TotalPhysicalRAM | 51.527082 | -1.075909 | 0.056811 | -0.406135 |
| Census_InternalPrimaryDiagonalDisplaySizeInInches | 20.879347 | -2.173076 | 0.023148 | -0.199584 |
| Census_SystemVolumeTotalCapacity | 22.248007 | -1.118732 | 0.013888 | -0.391553 |
| MAX | MIN | MEAN | MEDIAN | |
|---|---|---|---|---|
| Census_ProcessorCoreCount | 32.825009 | -1.443917 | 0.054469 | 0.004065 |
| Census_TotalPhysicalRAM | 104.284635 | -1.024589 | 0.059525 | -0.406135 |
| Census_InternalPrimaryDiagonalDisplaySizeInInches | 21.321681 | -1.900870 | 0.031498 | -0.199584 |
| Census_SystemVolumeTotalCapacity | 22.246550 | -1.117965 | 0.011637 | -0.392821 |
**COMMENT**:
We can se that there is a significant difference in the values of Core Processors, RAM and total capacity between Cluster_0 and Cluster_3. Those features can be used as a discriminant to find different type of machines. Although malware represent a menace in the same way for more powerful machines as for the weaker ones. </font>
percent_K4 = pd.DataFrame(index=["PercentageCluster0",
"PercentageCluster1",
"PercentageCluster2",
"PercentageCluster3"])
def dataframe_percent_K4 (df_cluster0,df_cluster1,df_cluster2, df_cluster3, column):
percent_K4[column] = [len(df_cluster0[df_cluster0[column]==1])/df_cluster0.shape[0]*100,
len(df_cluster1[df_cluster1[column]==1])/df_cluster1.shape[0]*100,
len(df_cluster2[df_cluster2[column]==1])/df_cluster2.shape[0]*100,
len(df_cluster3[df_cluster3[column]==1])/df_cluster3.shape[0]*100]
for i in bin_col_inf:
dataframe_percent_K4 (Cluster_0, Cluster_1,Cluster_2, Cluster_3, i)
percent_K4.T.style.apply(highlight_max, props='color:white; background-color:purple;', axis=1)
# axis=1 so in this way we find the higher percentage for each row
| PercentageCluster0 | PercentageCluster1 | PercentageCluster2 | PercentageCluster3 | |
|---|---|---|---|---|
| Wdft_IsGamer | 28.197108 | 27.940407 | 27.987336 | 28.527132 |
| Census_IsTouchEnabled | 12.651299 | 12.753060 | 12.497447 | 12.124031 |
| Census_IsPenCapable | 3.907702 | 3.872875 | 3.703956 | 3.689922 |
| IsSxsPassiveMode | 1.756653 | 1.731535 | 1.794104 | 1.736434 |
| IsProtected | 94.660909 | 94.615434 | 94.896848 | 94.914729 |
| Firewall | 97.796818 | 97.905617 | 97.933547 | 98.139535 |
| Census_HasOpticalDiskDrive | 7.921937 | 7.947985 | 7.796010 | 7.317829 |
| Census_IsSecureBootEnabled | 49.257673 | 49.082207 | 49.104650 | 49.240310 |
| Census_IsAlwaysOnAlwaysConnectedCapable | 5.803754 | 5.845597 | 5.603595 | 6.480620 |
| Replaced | 99.027608 | 99.035783 | 99.077415 | 98.945736 |
From what we can see from this results we can individualize different kind of users (according to different combinantion of data):
There is not so much differene among the different binary columns, but that make sense according to thwe plot that we obtained, where </font>
Let's check if the anti virus enabled can be used as a discriminant for possible clusters
from sklearn import preprocessing
values = np.append(Cluster_0['AVProductsEnabled'].values,Cluster_1['AVProductsEnabled'].values )
values = np.append(values,Cluster_2['AVProductsEnabled'].values )
values = np.append(values,Cluster_3['AVProductsEnabled'].values )
le = preprocessing.LabelEncoder()
le.fit(values)
for cl in clusters:
cl['AVProductsEnabled'] = le.transform(cl['AVProductsEnabled'])
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
for cl in clusters:
print(cl['AVProductsEnabled'].value_counts())
4 83042 3 1139 2 153 1 32 0 4 Name: AVProductsEnabled, dtype: int64 4 91519 3 1298 2 172 1 40 0 1 Name: AVProductsEnabled, dtype: int64 4 54853 3 720 2 108 1 18 Name: AVProductsEnabled, dtype: int64 4 73978 3 993 2 142 1 26 0 3 Name: AVProductsEnabled, dtype: int64
Now we analyse the percentage of every value of AVProductEnabled in every cluster
i = 0
for cl in clusters:
print('Cluster_',i)
print('00: ',len(cl[cl['AVProductsEnabled']==0])/cl.shape[0]*100)
print('01: ',len(cl[cl['AVProductsEnabled']==1])/cl.shape[0]*100)
print('02: ',len(cl[cl['AVProductsEnabled']==2])/cl.shape[0]*100)
print('03: ',len(cl[cl['AVProductsEnabled']==3])/cl.shape[0]*100)
print('04: ',len(cl[cl['AVProductsEnabled']==4])/cl.shape[0]*100)
Cluster_ 0 00: 0.004741021690174233 01: 0.03792817352139386 02: 0.1813440796491644 03: 1.3500059262771127 04: 98.42598079886216 Cluster_ 1 00: 0.0010749220681500592 01: 0.04299688272600236 02: 0.18488659572181018 03: 1.3952488444587767 04: 98.37579275502526 Cluster_ 2 00: 0.0 01: 0.03231655864557712 02: 0.19389935187346272 03: 1.2926623458230848 04: 98.48112174365788 Cluster_ 3 00: 0.003992440978414202 01: 0.03460115514625642 02: 0.1889755396449389 03: 1.321497963855101 04: 98.45093290037529
**COMMENT:**
Antivirus is not relevant for our analysis. Thus we can say that the majority of infected machines has all 4AV.</font>